1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
14 * The Original Code is the Netscape security libraries.
16 * The Initial Developer of the Original Code is
17 * Netscape Communications Corporation.
18 * Portions created by the Initial Developer are Copyright (C) 1994-2000
19 * the Initial Developer. All Rights Reserved.
22 * John Gardiner Myers <jgmyers@speakeasy.net>
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
39 static const char CVS_ID
[] = "@(#) $RCSfile: utf8.c,v $ $Revision: 1.13 $ $Date: 2008/10/05 20:59:26 $";
48 #define PORT_Assert assert
54 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
55 * 0000 0000-0000 007F 0xxxxxxx
56 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
57 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
58 * 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
59 * 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
60 * 0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx
64 * From http://www.imc.org/draft-hoffman-utf16
66 * For U on [0x00010000,0x0010FFFF]: Let U' = U - 0x00010000
68 * U' = yyyyyyyyyyxxxxxxxxxx
69 * W1 = 110110yyyyyyyyyy
70 * W2 = 110111xxxxxxxxxx
74 * This code is assuming NETWORK BYTE ORDER for the 16- and 32-bit
75 * character values. If you wish to use this code for working with
76 * host byte order values, define the following:
85 * #else / * not everyone has elif * /
86 * #if IS_LITTLE_ENDIAN
94 * #error "PDP and NUXI support deferred"
95 * #endif / * IS_LITTLE_ENDIAN * /
96 * #endif / * IS_BIG_ENDIAN * /
106 #define BAD_UTF8 ((PRUint32)-1)
109 * Parse a single UTF-8 character per the spec. in section 3.9 (D36)
113 * index - Points to the byte offset in inBuf of character to read. On success,
114 * updated to the offset of the following character.
115 * inBuf - Input buffer, UTF-8 encoded
116 * inBufLen - Length of input buffer, in bytes.
119 * Success - The UCS4 encoded character
/*
 * Decode the UTF-8 sequence that starts at inBuf[*index].
 * The lead byte is classified as 0xxxxxxx, 110xxxxx, 1110xxxx or 11110xxx;
 * every continuation byte must lie inside the buffer and match 10xxxxxx,
 * otherwise BAD_UTF8 is returned.
 * NOTE(review): several lines of this function (braces, declarations and the
 * per-branch setup of bytes_left/min_value) are not visible in this view.
 */
123 sec_port_read_utf8(unsigned int *index
, unsigned char *inBuf
, unsigned int inBufLen
)
/* Work on a local copy of the caller's offset. */
126 unsigned int i
= *index
;
/* The starting offset is only asserted to be in range here, not checked. */
130 PORT_Assert(i
< inBufLen
);
/* 0xxxxxxx: lead byte with the high bit clear (single-byte character). */
132 if ( (inBuf
[i
] & 0x80) == 0x00 ) {
136 } else if ( (inBuf
[i
] & 0xE0) == 0xC0 ) {
/* 110xxxxx lead byte: keep its 5 payload bits and step past it. */
137 result
= inBuf
[i
++] & 0x1F;
140 } else if ( (inBuf
[i
] & 0xF0) == 0xE0) {
/* 1110xxxx lead byte: keep its 4 payload bits and step past it. */
141 result
= inBuf
[i
++] & 0x0F;
144 } else if ( (inBuf
[i
] & 0xF8) == 0xF0) {
/* 11110xxx lead byte: keep its 3 payload bits and step past it. */
145 result
= inBuf
[i
++] & 0x07;
/*
 * Consume the continuation bytes: each one must exist (i < inBufLen) and
 * match 10xxxxxx; its low 6 bits are appended to the accumulated value.
 */
152 while (bytes_left
--) {
153 if (i
>= inBufLen
|| (inBuf
[i
] & 0xC0) != 0x80) return BAD_UTF8
;
154 result
= (result
<< 6) | (inBuf
[i
++] & 0x3F);
157 /* Check for overlong sequences, surrogates, and outside unicode range */
/*
 * (result & 0xFFFFF800) == 0xD800 matches exactly U+D800..U+DFFF.
 * min_value is assigned on lines not shown here; presumably it is the
 * smallest code point that legitimately needs this sequence length - confirm.
 */
158 if (result
< min_value
|| (result
& 0xFFFFF800) == 0xD800 || result
> 0x10FFFF) {
/*
 * Convert between UTF-8 and UCS-4 stored as four bytes per character.
 * L_0..L_3 name the byte positions of one UCS-4 value from most to least
 * significant (see the stores in the UTF-8 -> UCS-4 pass below); their numeric
 * values are defined outside this block.
 * NOTE(review): the return type, the first parameter (a direction selector,
 * judging by the PR_FALSE passed first by the test caller) and several
 * statements are on lines that are not visible in this view.
 */
167 sec_port_ucs4_utf8_conversion_function
170 unsigned char *inBuf
,
171 unsigned int inBufLen
,
172 unsigned char *outBuf
,
173 unsigned int maxOutBufLen
,
174 unsigned int *outBufLen
/* outBufLen must be a valid pointer; this is only asserted. */
177 PORT_Assert((unsigned int *)NULL
!= outBufLen
);
/*
 * UTF-8 -> UCS-4, pass 1: walk the input by lead byte, skipping 1..4 bytes
 * per character, and fail on a byte that is not a valid lead byte.
 * The statement that grows len is not visible here.
 */
180 unsigned int i
, len
= 0;
182 for( i
= 0; i
< inBufLen
; ) {
183 if( (inBuf
[i
] & 0x80) == 0x00 ) i
+= 1;
184 else if( (inBuf
[i
] & 0xE0) == 0xC0 ) i
+= 2;
185 else if( (inBuf
[i
] & 0xF0) == 0xE0 ) i
+= 3;
186 else if( (inBuf
[i
] & 0xF8) == 0xF0 ) i
+= 4;
187 else return PR_FALSE
;
/* Required size is compared with the caller's capacity before any write. */
192 if( len
> maxOutBufLen
) {
/*
 * UTF-8 -> UCS-4, pass 2: decode each character with sec_port_read_utf8
 * (which advances i) and store it as 00, bits 23..16, bits 15..8, bits 7..0.
 */
199 for( i
= 0; i
< inBufLen
; ) {
200 PRUint32 ucs4
= sec_port_read_utf8(&i
, inBuf
, inBufLen
);
202 if (ucs4
== BAD_UTF8
) return PR_FALSE
;
204 outBuf
[len
+L_0
] = 0x00;
205 outBuf
[len
+L_1
] = (unsigned char)(ucs4
>> 16);
206 outBuf
[len
+L_2
] = (unsigned char)(ucs4
>> 8);
207 outBuf
[len
+L_3
] = (unsigned char)ucs4
;
/* UCS-4 -> UTF-8: the input must be a whole number of 4-byte characters. */
215 unsigned int i
, len
= 0;
216 PORT_Assert((inBufLen
% 4) == 0);
217 if ((inBufLen
% 4) != 0) {
/*
 * UCS-4 -> UTF-8, pass 1: size the output. The first test singles out values
 * above 0x10FFFF (its body is not visible here); the remaining tests count
 * 4, 3 or 2 output bytes per character.
 * NOTE(review): no visible test singles out D800-DFFF, so surrogate values
 * appear to take the 3-byte branch - confirm whether that is intended.
 */
222 for( i
= 0; i
< inBufLen
; i
+= 4 ) {
223 if( (inBuf
[i
+L_0
] > 0x00) || (inBuf
[i
+L_1
] > 0x10) ) {
226 } else if( inBuf
[i
+L_1
] >= 0x01 ) len
+= 4;
227 else if( inBuf
[i
+L_2
] >= 0x08 ) len
+= 3;
228 else if( (inBuf
[i
+L_2
] > 0x00) || (inBuf
[i
+L_3
] >= 0x80) ) len
+= 2;
/* Required size is compared with the caller's capacity before any write. */
232 if( len
> maxOutBufLen
) {
/* UCS-4 -> UTF-8, pass 2: emit the bytes, using the same tests as pass 1. */
239 for( i
= 0; i
< inBufLen
; i
+= 4 ) {
240 if( inBuf
[i
+L_1
] >= 0x01 ) {
241 /* 0001 0000-001F FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
242 /* 00000000 000abcde fghijklm nopqrstu ->
243 11110abc 10defghi 10jklmno 10pqrstu */
245 outBuf
[len
+0] = 0xF0 | ((inBuf
[i
+L_1
] & 0x1C) >> 2);
246 outBuf
[len
+1] = 0x80 | ((inBuf
[i
+L_1
] & 0x03) << 4)
247 | ((inBuf
[i
+L_2
] & 0xF0) >> 4);
248 outBuf
[len
+2] = 0x80 | ((inBuf
[i
+L_2
] & 0x0F) << 2)
249 | ((inBuf
[i
+L_3
] & 0xC0) >> 6);
250 outBuf
[len
+3] = 0x80 | ((inBuf
[i
+L_3
] & 0x3F) >> 0);
253 } else if( inBuf
[i
+L_2
] >= 0x08 ) {
254 /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
255 /* 00000000 00000000 abcdefgh ijklmnop ->
256 1110abcd 10efghij 10klmnop */
258 outBuf
[len
+0] = 0xE0 | ((inBuf
[i
+L_2
] & 0xF0) >> 4);
259 outBuf
[len
+1] = 0x80 | ((inBuf
[i
+L_2
] & 0x0F) << 2)
260 | ((inBuf
[i
+L_3
] & 0xC0) >> 6);
261 outBuf
[len
+2] = 0x80 | ((inBuf
[i
+L_3
] & 0x3F) >> 0);
264 } else if( (inBuf
[i
+L_2
] > 0x00) || (inBuf
[i
+L_3
] >= 0x80) ) {
265 /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
266 /* 00000000 00000000 00000abc defghijk ->
269 outBuf
[len
+0] = 0xC0 | ((inBuf
[i
+L_2
] & 0x07) << 2)
270 | ((inBuf
[i
+L_3
] & 0xC0) >> 6);
271 outBuf
[len
+1] = 0x80 | ((inBuf
[i
+L_3
] & 0x3F) >> 0);
275 /* 0000 0000-0000 007F -> 0xxxxxxx */
276 /* 00000000 00000000 00000000 0abcdefg ->
279 outBuf
[len
+0] = (inBuf
[i
+L_3
] & 0x7F);
/*
 * Convert between UTF-8 and UCS-2/UTF-16 stored as two bytes per code unit.
 * H_0 names the high byte and H_1 the low byte of a code unit (see the stores
 * in the UTF-8 -> UTF-16 pass below); their numeric values are defined outside
 * this block. Characters above U+FFFF are carried as a surrogate pair
 * (110110xx xxxxxxxx 110111xx xxxxxxxx).
 * NOTE(review): the return type, the first parameter (a direction selector,
 * judging by the parallel UCS-4 routine's caller) and several statements are
 * on lines that are not visible in this view; only visible tokens were edited.
 */
291 sec_port_ucs2_utf8_conversion_function
294 unsigned char *inBuf
,
295 unsigned int inBufLen
,
296 unsigned char *outBuf
,
297 unsigned int maxOutBufLen
,
298 unsigned int *outBufLen
/* outBufLen must be a valid pointer; this is only asserted. */
301 PORT_Assert((unsigned int *)NULL
!= outBufLen
);
/*
 * UTF-8 -> UTF-16, pass 1: classify each lead byte and fail on a byte that is
 * not a valid lead byte. The branch bodies are not visible here.
 */
304 unsigned int i
, len
= 0;
306 for( i
= 0; i
< inBufLen
; ) {
307 if( (inBuf
[i
] & 0x80) == 0x00 ) {
310 } else if( (inBuf
[i
] & 0xE0) == 0xC0 ) {
313 } else if( (inBuf
[i
] & 0xF0) == 0xE0 ) {
316 } else if( (inBuf
[i
] & 0xF8) == 0xF0 ) {
319 } else return PR_FALSE
;
/* Required size is compared with the caller's capacity before any write. */
322 if( len
> maxOutBufLen
) {
/*
 * UTF-8 -> UTF-16, pass 2: decode each character with sec_port_read_utf8
 * (which advances i). Values below 0x10000 are stored as one code unit;
 * larger values are stored as a D8xx/DCxx surrogate pair.
 */
329 for( i
= 0; i
< inBufLen
; ) {
330 PRUint32 ucs4
= sec_port_read_utf8(&i
, inBuf
, inBufLen
);
332 if (ucs4
== BAD_UTF8
) return PR_FALSE
;
334 if( ucs4
< 0x10000) {
335 outBuf
[len
+H_0
] = (unsigned char)(ucs4
>> 8);
336 outBuf
[len
+H_1
] = (unsigned char)ucs4
;
/* High surrogate: 110110 followed by the top 10 payload bits. */
340 outBuf
[len
+0+H_0
] = (unsigned char)(0xD8 | ((ucs4
>> 18) & 0x3));
341 outBuf
[len
+0+H_1
] = (unsigned char)(ucs4
>> 10);
/* Low surrogate: 110111 followed by the bottom 10 payload bits. */
342 outBuf
[len
+2+H_0
] = (unsigned char)(0xDC | ((ucs4
>> 8) & 0x3));
343 outBuf
[len
+2+H_1
] = (unsigned char)ucs4
;
/* UTF-16 -> UTF-8: the input must be a whole number of 2-byte code units. */
351 unsigned int i
, len
= 0;
352 PORT_Assert((inBufLen
% 2) == 0);
353 if ((inBufLen
% 2) != 0) {
/*
 * UTF-16 -> UTF-8, pass 1: size the output. The tests here must classify
 * every code unit exactly as pass 2 does, otherwise pass 2 writes more bytes
 * than were budgeted.
 */
358 for( i
= 0; i
< inBufLen
; i
+= 2 ) {
/*
 * FIX: the 7-bit test must look at the low byte (H_1), as pass 2 does.
 * Testing H_0 twice sized U+0080..U+00FF as 1 byte although 2 are written.
 */
359 if( (inBuf
[i
+H_1
] & 0x80) == 0x00 && (inBuf
[i
+H_0
] == 0x00) ) len
+= 1;
360 else if( inBuf
[i
+H_0
] < 0x08 ) len
+= 2;
/*
 * FIX: a high surrogate is 110110xx, so all six top bits are compared
 * (mask 0xFC). Mask 0xDC ignored bit 5 and also matched 0xF8..0xFB.
 */
361 else if( ((inBuf
[i
+0+H_0
] & 0xFC) == 0xD8) ) {
/*
 * FIX: confirm that a following code unit exists before reading it, and
 * compare all six top bits of the low surrogate (110111xx).
 */
362 if( ((inBufLen
- i
) > 2) && ((inBuf
[i
+2+H_0
] & 0xFC) == 0xDC) ) {
/* Required size is compared with the caller's capacity before any write. */
372 if( len
> maxOutBufLen
) {
/* UTF-16 -> UTF-8, pass 2: emit the bytes, using the same tests as pass 1. */
379 for( i
= 0; i
< inBufLen
; i
+= 2 ) {
380 if( (inBuf
[i
+H_0
] == 0x00) && ((inBuf
[i
+H_1
] & 0x80) == 0x00) ) {
381 /* 0000-007F -> 0xxxxxxx */
382 /* 00000000 0abcdefg -> 0abcdefg */
384 outBuf
[len
] = inBuf
[i
+H_1
] & 0x7F;
387 } else if( inBuf
[i
+H_0
] < 0x08 ) {
388 /* 0080-07FF -> 110xxxxx 10xxxxxx */
389 /* 00000abc defghijk -> 110abcde 10fghijk */
391 outBuf
[len
+0] = 0xC0 | ((inBuf
[i
+H_0
] & 0x07) << 2)
392 | ((inBuf
[i
+H_1
] & 0xC0) >> 6);
393 outBuf
[len
+1] = 0x80 | ((inBuf
[i
+H_1
] & 0x3F) >> 0);
/* Same high-surrogate test as pass 1 (mask 0xFC, see FIX above). */
396 } else if( (inBuf
[i
+H_0
] & 0xFC) == 0xD8 ) {
/* Pass 1 already validated the pair; length is checked before the read. */
399 PORT_Assert(((inBufLen
- i
) > 2) && ((inBuf
[i
+2+H_0
] & 0xFC) == 0xDC));
401 /* D800-DBFF DC00-DFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
402 /* 110110BC DEfghijk 110111lm nopqrstu ->
403 { Let abcde = BCDE + 1 }
404 11110abc 10defghi 10jklmno 10pqrstu */
406 BCDE
= ((inBuf
[i
+H_0
] & 0x03) << 2) | ((inBuf
[i
+H_1
] & 0xC0) >> 6);
409 outBuf
[len
+0] = 0xF0 | ((abcde
& 0x1C) >> 2);
410 outBuf
[len
+1] = 0x80 | ((abcde
& 0x03) << 4)
411 | ((inBuf
[i
+0+H_1
] & 0x3C) >> 2);
412 outBuf
[len
+2] = 0x80 | ((inBuf
[i
+0+H_1
] & 0x03) << 4)
413 | ((inBuf
[i
+2+H_0
] & 0x03) << 2)
414 | ((inBuf
[i
+2+H_1
] & 0xC0) >> 6);
415 outBuf
[len
+3] = 0x80 | ((inBuf
[i
+2+H_1
] & 0x3F) >> 0);
/*
 * NOTE(review): no visible test singles out an unpaired low surrogate
 * (DC00-DFFF), so it appears to reach this 3-byte branch - confirm.
 */
420 /* 0800-FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
421 /* abcdefgh ijklmnop -> 1110abcd 10efghij 10klmnop */
423 outBuf
[len
+0] = 0xE0 | ((inBuf
[i
+H_0
] & 0xF0) >> 4);
424 outBuf
[len
+1] = 0x80 | ((inBuf
[i
+H_0
] & 0x0F) << 2)
425 | ((inBuf
[i
+H_1
] & 0xC0) >> 6);
426 outBuf
[len
+2] = 0x80 | ((inBuf
[i
+H_1
] & 0x3F) >> 0);
/*
 * Convert ISO-8859-1 bytes to UTF-8. Bytes with the high bit clear are copied
 * unchanged; bytes 0x80..0xFF are written as the two bytes 110000ab 10cdefgh.
 * NOTE(review): the return type and several statements (including the failure
 * and success exits) are on lines that are not visible in this view.
 */
438 sec_port_iso88591_utf8_conversion_function
440 const unsigned char *inBuf
,
441 unsigned int inBufLen
,
442 unsigned char *outBuf
,
443 unsigned int maxOutBufLen
,
444 unsigned int *outBufLen
447 unsigned int i
, len
= 0;
/* outBufLen must be a valid pointer; this is only asserted. */
449 PORT_Assert((unsigned int *)NULL
!= outBufLen
);
/*
 * Pass 1: size the output. A 7-bit byte counts as one output byte; the
 * count for bytes with the high bit set is on a line not visible here.
 */
451 for( i
= 0; i
< inBufLen
; i
++) {
452 if( (inBuf
[i
] & 0x80) == 0x00 ) len
+= 1;
/* Required size is compared with the caller's capacity before any write. */
456 if( len
> maxOutBufLen
) {
/* Pass 2: emit the bytes. */
463 for( i
= 0; i
< inBufLen
; i
++) {
464 if( (inBuf
[i
] & 0x80) == 0x00 ) {
465 /* 00-7F -> 0xxxxxxx */
466 /* 0abcdefg -> 0abcdefg */
468 outBuf
[len
] = inBuf
[i
];
471 /* 80-FF -> 110xxxxx 10xxxxxx */
472 /* 00000000 abcdefgh -> 110000ab 10cdefgh */
/* Lead byte carries the top two input bits, the trailer the low six. */
474 outBuf
[len
+0] = 0xC0 | ((inBuf
[i
] & 0xC0) >> 6);
475 outBuf
[len
+1] = 0x80 | ((inBuf
[i
] & 0x3F) >> 0);
490 #include <netinet/in.h> /* for htonl and htons */
524 struct ucs4 ucs4
[] = {
525 { 0x00000001, "\x01" },
526 { 0x00000002, "\x02" },
527 { 0x00000003, "\x03" },
528 { 0x00000004, "\x04" },
529 { 0x00000007, "\x07" },
530 { 0x00000008, "\x08" },
531 { 0x0000000F, "\x0F" },
532 { 0x00000010, "\x10" },
533 { 0x0000001F, "\x1F" },
534 { 0x00000020, "\x20" },
535 { 0x0000003F, "\x3F" },
536 { 0x00000040, "\x40" },
537 { 0x0000007F, "\x7F" },
539 { 0x00000080, "\xC2\x80" },
540 { 0x00000081, "\xC2\x81" },
541 { 0x00000082, "\xC2\x82" },
542 { 0x00000084, "\xC2\x84" },
543 { 0x00000088, "\xC2\x88" },
544 { 0x00000090, "\xC2\x90" },
545 { 0x000000A0, "\xC2\xA0" },
546 { 0x000000C0, "\xC3\x80" },
547 { 0x000000FF, "\xC3\xBF" },
548 { 0x00000100, "\xC4\x80" },
549 { 0x00000101, "\xC4\x81" },
550 { 0x00000102, "\xC4\x82" },
551 { 0x00000104, "\xC4\x84" },
552 { 0x00000108, "\xC4\x88" },
553 { 0x00000110, "\xC4\x90" },
554 { 0x00000120, "\xC4\xA0" },
555 { 0x00000140, "\xC5\x80" },
556 { 0x00000180, "\xC6\x80" },
557 { 0x000001FF, "\xC7\xBF" },
558 { 0x00000200, "\xC8\x80" },
559 { 0x00000201, "\xC8\x81" },
560 { 0x00000202, "\xC8\x82" },
561 { 0x00000204, "\xC8\x84" },
562 { 0x00000208, "\xC8\x88" },
563 { 0x00000210, "\xC8\x90" },
564 { 0x00000220, "\xC8\xA0" },
565 { 0x00000240, "\xC9\x80" },
566 { 0x00000280, "\xCA\x80" },
567 { 0x00000300, "\xCC\x80" },
568 { 0x000003FF, "\xCF\xBF" },
569 { 0x00000400, "\xD0\x80" },
570 { 0x00000401, "\xD0\x81" },
571 { 0x00000402, "\xD0\x82" },
572 { 0x00000404, "\xD0\x84" },
573 { 0x00000408, "\xD0\x88" },
574 { 0x00000410, "\xD0\x90" },
575 { 0x00000420, "\xD0\xA0" },
576 { 0x00000440, "\xD1\x80" },
577 { 0x00000480, "\xD2\x80" },
578 { 0x00000500, "\xD4\x80" },
579 { 0x00000600, "\xD8\x80" },
580 { 0x000007FF, "\xDF\xBF" },
582 { 0x00000800, "\xE0\xA0\x80" },
583 { 0x00000801, "\xE0\xA0\x81" },
584 { 0x00000802, "\xE0\xA0\x82" },
585 { 0x00000804, "\xE0\xA0\x84" },
586 { 0x00000808, "\xE0\xA0\x88" },
587 { 0x00000810, "\xE0\xA0\x90" },
588 { 0x00000820, "\xE0\xA0\xA0" },
589 { 0x00000840, "\xE0\xA1\x80" },
590 { 0x00000880, "\xE0\xA2\x80" },
591 { 0x00000900, "\xE0\xA4\x80" },
592 { 0x00000A00, "\xE0\xA8\x80" },
593 { 0x00000C00, "\xE0\xB0\x80" },
594 { 0x00000FFF, "\xE0\xBF\xBF" },
595 { 0x00001000, "\xE1\x80\x80" },
596 { 0x00001001, "\xE1\x80\x81" },
597 { 0x00001002, "\xE1\x80\x82" },
598 { 0x00001004, "\xE1\x80\x84" },
599 { 0x00001008, "\xE1\x80\x88" },
600 { 0x00001010, "\xE1\x80\x90" },
601 { 0x00001020, "\xE1\x80\xA0" },
602 { 0x00001040, "\xE1\x81\x80" },
603 { 0x00001080, "\xE1\x82\x80" },
604 { 0x00001100, "\xE1\x84\x80" },
605 { 0x00001200, "\xE1\x88\x80" },
606 { 0x00001400, "\xE1\x90\x80" },
607 { 0x00001800, "\xE1\xA0\x80" },
608 { 0x00001FFF, "\xE1\xBF\xBF" },
609 { 0x00002000, "\xE2\x80\x80" },
610 { 0x00002001, "\xE2\x80\x81" },
611 { 0x00002002, "\xE2\x80\x82" },
612 { 0x00002004, "\xE2\x80\x84" },
613 { 0x00002008, "\xE2\x80\x88" },
614 { 0x00002010, "\xE2\x80\x90" },
615 { 0x00002020, "\xE2\x80\xA0" },
616 { 0x00002040, "\xE2\x81\x80" },
617 { 0x00002080, "\xE2\x82\x80" },
618 { 0x00002100, "\xE2\x84\x80" },
619 { 0x00002200, "\xE2\x88\x80" },
620 { 0x00002400, "\xE2\x90\x80" },
621 { 0x00002800, "\xE2\xA0\x80" },
622 { 0x00003000, "\xE3\x80\x80" },
623 { 0x00003FFF, "\xE3\xBF\xBF" },
624 { 0x00004000, "\xE4\x80\x80" },
625 { 0x00004001, "\xE4\x80\x81" },
626 { 0x00004002, "\xE4\x80\x82" },
627 { 0x00004004, "\xE4\x80\x84" },
628 { 0x00004008, "\xE4\x80\x88" },
629 { 0x00004010, "\xE4\x80\x90" },
630 { 0x00004020, "\xE4\x80\xA0" },
631 { 0x00004040, "\xE4\x81\x80" },
632 { 0x00004080, "\xE4\x82\x80" },
633 { 0x00004100, "\xE4\x84\x80" },
634 { 0x00004200, "\xE4\x88\x80" },
635 { 0x00004400, "\xE4\x90\x80" },
636 { 0x00004800, "\xE4\xA0\x80" },
637 { 0x00005000, "\xE5\x80\x80" },
638 { 0x00006000, "\xE6\x80\x80" },
639 { 0x00007FFF, "\xE7\xBF\xBF" },
640 { 0x00008000, "\xE8\x80\x80" },
641 { 0x00008001, "\xE8\x80\x81" },
642 { 0x00008002, "\xE8\x80\x82" },
643 { 0x00008004, "\xE8\x80\x84" },
644 { 0x00008008, "\xE8\x80\x88" },
645 { 0x00008010, "\xE8\x80\x90" },
646 { 0x00008020, "\xE8\x80\xA0" },
647 { 0x00008040, "\xE8\x81\x80" },
648 { 0x00008080, "\xE8\x82\x80" },
649 { 0x00008100, "\xE8\x84\x80" },
650 { 0x00008200, "\xE8\x88\x80" },
651 { 0x00008400, "\xE8\x90\x80" },
652 { 0x00008800, "\xE8\xA0\x80" },
653 { 0x00009000, "\xE9\x80\x80" },
654 { 0x0000A000, "\xEA\x80\x80" },
655 { 0x0000C000, "\xEC\x80\x80" },
656 { 0x0000FFFF, "\xEF\xBF\xBF" },
658 { 0x00010000, "\xF0\x90\x80\x80" },
659 { 0x00010001, "\xF0\x90\x80\x81" },
660 { 0x00010002, "\xF0\x90\x80\x82" },
661 { 0x00010004, "\xF0\x90\x80\x84" },
662 { 0x00010008, "\xF0\x90\x80\x88" },
663 { 0x00010010, "\xF0\x90\x80\x90" },
664 { 0x00010020, "\xF0\x90\x80\xA0" },
665 { 0x00010040, "\xF0\x90\x81\x80" },
666 { 0x00010080, "\xF0\x90\x82\x80" },
667 { 0x00010100, "\xF0\x90\x84\x80" },
668 { 0x00010200, "\xF0\x90\x88\x80" },
669 { 0x00010400, "\xF0\x90\x90\x80" },
670 { 0x00010800, "\xF0\x90\xA0\x80" },
671 { 0x00011000, "\xF0\x91\x80\x80" },
672 { 0x00012000, "\xF0\x92\x80\x80" },
673 { 0x00014000, "\xF0\x94\x80\x80" },
674 { 0x00018000, "\xF0\x98\x80\x80" },
675 { 0x0001FFFF, "\xF0\x9F\xBF\xBF" },
676 { 0x00020000, "\xF0\xA0\x80\x80" },
677 { 0x00020001, "\xF0\xA0\x80\x81" },
678 { 0x00020002, "\xF0\xA0\x80\x82" },
679 { 0x00020004, "\xF0\xA0\x80\x84" },
680 { 0x00020008, "\xF0\xA0\x80\x88" },
681 { 0x00020010, "\xF0\xA0\x80\x90" },
682 { 0x00020020, "\xF0\xA0\x80\xA0" },
683 { 0x00020040, "\xF0\xA0\x81\x80" },
684 { 0x00020080, "\xF0\xA0\x82\x80" },
685 { 0x00020100, "\xF0\xA0\x84\x80" },
686 { 0x00020200, "\xF0\xA0\x88\x80" },
687 { 0x00020400, "\xF0\xA0\x90\x80" },
688 { 0x00020800, "\xF0\xA0\xA0\x80" },
689 { 0x00021000, "\xF0\xA1\x80\x80" },
690 { 0x00022000, "\xF0\xA2\x80\x80" },
691 { 0x00024000, "\xF0\xA4\x80\x80" },
692 { 0x00028000, "\xF0\xA8\x80\x80" },
693 { 0x00030000, "\xF0\xB0\x80\x80" },
694 { 0x0003FFFF, "\xF0\xBF\xBF\xBF" },
695 { 0x00040000, "\xF1\x80\x80\x80" },
696 { 0x00040001, "\xF1\x80\x80\x81" },
697 { 0x00040002, "\xF1\x80\x80\x82" },
698 { 0x00040004, "\xF1\x80\x80\x84" },
699 { 0x00040008, "\xF1\x80\x80\x88" },
700 { 0x00040010, "\xF1\x80\x80\x90" },
701 { 0x00040020, "\xF1\x80\x80\xA0" },
702 { 0x00040040, "\xF1\x80\x81\x80" },
703 { 0x00040080, "\xF1\x80\x82\x80" },
704 { 0x00040100, "\xF1\x80\x84\x80" },
705 { 0x00040200, "\xF1\x80\x88\x80" },
706 { 0x00040400, "\xF1\x80\x90\x80" },
707 { 0x00040800, "\xF1\x80\xA0\x80" },
708 { 0x00041000, "\xF1\x81\x80\x80" },
709 { 0x00042000, "\xF1\x82\x80\x80" },
710 { 0x00044000, "\xF1\x84\x80\x80" },
711 { 0x00048000, "\xF1\x88\x80\x80" },
712 { 0x00050000, "\xF1\x90\x80\x80" },
713 { 0x00060000, "\xF1\xA0\x80\x80" },
714 { 0x0007FFFF, "\xF1\xBF\xBF\xBF" },
715 { 0x00080000, "\xF2\x80\x80\x80" },
716 { 0x00080001, "\xF2\x80\x80\x81" },
717 { 0x00080002, "\xF2\x80\x80\x82" },
718 { 0x00080004, "\xF2\x80\x80\x84" },
719 { 0x00080008, "\xF2\x80\x80\x88" },
720 { 0x00080010, "\xF2\x80\x80\x90" },
721 { 0x00080020, "\xF2\x80\x80\xA0" },
722 { 0x00080040, "\xF2\x80\x81\x80" },
723 { 0x00080080, "\xF2\x80\x82\x80" },
724 { 0x00080100, "\xF2\x80\x84\x80" },
725 { 0x00080200, "\xF2\x80\x88\x80" },
726 { 0x00080400, "\xF2\x80\x90\x80" },
727 { 0x00080800, "\xF2\x80\xA0\x80" },
728 { 0x00081000, "\xF2\x81\x80\x80" },
729 { 0x00082000, "\xF2\x82\x80\x80" },
730 { 0x00084000, "\xF2\x84\x80\x80" },
731 { 0x00088000, "\xF2\x88\x80\x80" },
732 { 0x00090000, "\xF2\x90\x80\x80" },
733 { 0x000A0000, "\xF2\xA0\x80\x80" },
734 { 0x000C0000, "\xF3\x80\x80\x80" },
735 { 0x000FFFFF, "\xF3\xBF\xBF\xBF" },
736 { 0x00100000, "\xF4\x80\x80\x80" },
737 { 0x00100001, "\xF4\x80\x80\x81" },
738 { 0x00100002, "\xF4\x80\x80\x82" },
739 { 0x00100004, "\xF4\x80\x80\x84" },
740 { 0x00100008, "\xF4\x80\x80\x88" },
741 { 0x00100010, "\xF4\x80\x80\x90" },
742 { 0x00100020, "\xF4\x80\x80\xA0" },
743 { 0x00100040, "\xF4\x80\x81\x80" },
744 { 0x00100080, "\xF4\x80\x82\x80" },
745 { 0x00100100, "\xF4\x80\x84\x80" },
746 { 0x00100200, "\xF4\x80\x88\x80" },
747 { 0x00100400, "\xF4\x80\x90\x80" },
748 { 0x00100800, "\xF4\x80\xA0\x80" },
749 { 0x00101000, "\xF4\x81\x80\x80" },
750 { 0x00102000, "\xF4\x82\x80\x80" },
751 { 0x00104000, "\xF4\x84\x80\x80" },
752 { 0x00108000, "\xF4\x88\x80\x80" },
753 { 0x0010FFFF, "\xF4\x8F\xBF\xBF" },
760 struct ucs2 ucs2
[] = {
775 { 0x0080, "\xC2\x80" },
776 { 0x0081, "\xC2\x81" },
777 { 0x0082, "\xC2\x82" },
778 { 0x0084, "\xC2\x84" },
779 { 0x0088, "\xC2\x88" },
780 { 0x0090, "\xC2\x90" },
781 { 0x00A0, "\xC2\xA0" },
782 { 0x00C0, "\xC3\x80" },
783 { 0x00FF, "\xC3\xBF" },
784 { 0x0100, "\xC4\x80" },
785 { 0x0101, "\xC4\x81" },
786 { 0x0102, "\xC4\x82" },
787 { 0x0104, "\xC4\x84" },
788 { 0x0108, "\xC4\x88" },
789 { 0x0110, "\xC4\x90" },
790 { 0x0120, "\xC4\xA0" },
791 { 0x0140, "\xC5\x80" },
792 { 0x0180, "\xC6\x80" },
793 { 0x01FF, "\xC7\xBF" },
794 { 0x0200, "\xC8\x80" },
795 { 0x0201, "\xC8\x81" },
796 { 0x0202, "\xC8\x82" },
797 { 0x0204, "\xC8\x84" },
798 { 0x0208, "\xC8\x88" },
799 { 0x0210, "\xC8\x90" },
800 { 0x0220, "\xC8\xA0" },
801 { 0x0240, "\xC9\x80" },
802 { 0x0280, "\xCA\x80" },
803 { 0x0300, "\xCC\x80" },
804 { 0x03FF, "\xCF\xBF" },
805 { 0x0400, "\xD0\x80" },
806 { 0x0401, "\xD0\x81" },
807 { 0x0402, "\xD0\x82" },
808 { 0x0404, "\xD0\x84" },
809 { 0x0408, "\xD0\x88" },
810 { 0x0410, "\xD0\x90" },
811 { 0x0420, "\xD0\xA0" },
812 { 0x0440, "\xD1\x80" },
813 { 0x0480, "\xD2\x80" },
814 { 0x0500, "\xD4\x80" },
815 { 0x0600, "\xD8\x80" },
816 { 0x07FF, "\xDF\xBF" },
818 { 0x0800, "\xE0\xA0\x80" },
819 { 0x0801, "\xE0\xA0\x81" },
820 { 0x0802, "\xE0\xA0\x82" },
821 { 0x0804, "\xE0\xA0\x84" },
822 { 0x0808, "\xE0\xA0\x88" },
823 { 0x0810, "\xE0\xA0\x90" },
824 { 0x0820, "\xE0\xA0\xA0" },
825 { 0x0840, "\xE0\xA1\x80" },
826 { 0x0880, "\xE0\xA2\x80" },
827 { 0x0900, "\xE0\xA4\x80" },
828 { 0x0A00, "\xE0\xA8\x80" },
829 { 0x0C00, "\xE0\xB0\x80" },
830 { 0x0FFF, "\xE0\xBF\xBF" },
831 { 0x1000, "\xE1\x80\x80" },
832 { 0x1001, "\xE1\x80\x81" },
833 { 0x1002, "\xE1\x80\x82" },
834 { 0x1004, "\xE1\x80\x84" },
835 { 0x1008, "\xE1\x80\x88" },
836 { 0x1010, "\xE1\x80\x90" },
837 { 0x1020, "\xE1\x80\xA0" },
838 { 0x1040, "\xE1\x81\x80" },
839 { 0x1080, "\xE1\x82\x80" },
840 { 0x1100, "\xE1\x84\x80" },
841 { 0x1200, "\xE1\x88\x80" },
842 { 0x1400, "\xE1\x90\x80" },
843 { 0x1800, "\xE1\xA0\x80" },
844 { 0x1FFF, "\xE1\xBF\xBF" },
845 { 0x2000, "\xE2\x80\x80" },
846 { 0x2001, "\xE2\x80\x81" },
847 { 0x2002, "\xE2\x80\x82" },
848 { 0x2004, "\xE2\x80\x84" },
849 { 0x2008, "\xE2\x80\x88" },
850 { 0x2010, "\xE2\x80\x90" },
851 { 0x2020, "\xE2\x80\xA0" },
852 { 0x2040, "\xE2\x81\x80" },
853 { 0x2080, "\xE2\x82\x80" },
854 { 0x2100, "\xE2\x84\x80" },
855 { 0x2200, "\xE2\x88\x80" },
856 { 0x2400, "\xE2\x90\x80" },
857 { 0x2800, "\xE2\xA0\x80" },
858 { 0x3000, "\xE3\x80\x80" },
859 { 0x3FFF, "\xE3\xBF\xBF" },
860 { 0x4000, "\xE4\x80\x80" },
861 { 0x4001, "\xE4\x80\x81" },
862 { 0x4002, "\xE4\x80\x82" },
863 { 0x4004, "\xE4\x80\x84" },
864 { 0x4008, "\xE4\x80\x88" },
865 { 0x4010, "\xE4\x80\x90" },
866 { 0x4020, "\xE4\x80\xA0" },
867 { 0x4040, "\xE4\x81\x80" },
868 { 0x4080, "\xE4\x82\x80" },
869 { 0x4100, "\xE4\x84\x80" },
870 { 0x4200, "\xE4\x88\x80" },
871 { 0x4400, "\xE4\x90\x80" },
872 { 0x4800, "\xE4\xA0\x80" },
873 { 0x5000, "\xE5\x80\x80" },
874 { 0x6000, "\xE6\x80\x80" },
875 { 0x7FFF, "\xE7\xBF\xBF" },
876 { 0x8000, "\xE8\x80\x80" },
877 { 0x8001, "\xE8\x80\x81" },
878 { 0x8002, "\xE8\x80\x82" },
879 { 0x8004, "\xE8\x80\x84" },
880 { 0x8008, "\xE8\x80\x88" },
881 { 0x8010, "\xE8\x80\x90" },
882 { 0x8020, "\xE8\x80\xA0" },
883 { 0x8040, "\xE8\x81\x80" },
884 { 0x8080, "\xE8\x82\x80" },
885 { 0x8100, "\xE8\x84\x80" },
886 { 0x8200, "\xE8\x88\x80" },
887 { 0x8400, "\xE8\x90\x80" },
888 { 0x8800, "\xE8\xA0\x80" },
889 { 0x9000, "\xE9\x80\x80" },
890 { 0xA000, "\xEA\x80\x80" },
891 { 0xC000, "\xEC\x80\x80" },
892 { 0xFFFF, "\xEF\xBF\xBF" }
900 struct utf16 utf16
[] = {
901 { 0x00010000, { 0xD800, 0xDC00 } },
902 { 0x00010001, { 0xD800, 0xDC01 } },
903 { 0x00010002, { 0xD800, 0xDC02 } },
904 { 0x00010003, { 0xD800, 0xDC03 } },
905 { 0x00010004, { 0xD800, 0xDC04 } },
906 { 0x00010007, { 0xD800, 0xDC07 } },
907 { 0x00010008, { 0xD800, 0xDC08 } },
908 { 0x0001000F, { 0xD800, 0xDC0F } },
909 { 0x00010010, { 0xD800, 0xDC10 } },
910 { 0x0001001F, { 0xD800, 0xDC1F } },
911 { 0x00010020, { 0xD800, 0xDC20 } },
912 { 0x0001003F, { 0xD800, 0xDC3F } },
913 { 0x00010040, { 0xD800, 0xDC40 } },
914 { 0x0001007F, { 0xD800, 0xDC7F } },
915 { 0x00010080, { 0xD800, 0xDC80 } },
916 { 0x00010081, { 0xD800, 0xDC81 } },
917 { 0x00010082, { 0xD800, 0xDC82 } },
918 { 0x00010084, { 0xD800, 0xDC84 } },
919 { 0x00010088, { 0xD800, 0xDC88 } },
920 { 0x00010090, { 0xD800, 0xDC90 } },
921 { 0x000100A0, { 0xD800, 0xDCA0 } },
922 { 0x000100C0, { 0xD800, 0xDCC0 } },
923 { 0x000100FF, { 0xD800, 0xDCFF } },
924 { 0x00010100, { 0xD800, 0xDD00 } },
925 { 0x00010101, { 0xD800, 0xDD01 } },
926 { 0x00010102, { 0xD800, 0xDD02 } },
927 { 0x00010104, { 0xD800, 0xDD04 } },
928 { 0x00010108, { 0xD800, 0xDD08 } },
929 { 0x00010110, { 0xD800, 0xDD10 } },
930 { 0x00010120, { 0xD800, 0xDD20 } },
931 { 0x00010140, { 0xD800, 0xDD40 } },
932 { 0x00010180, { 0xD800, 0xDD80 } },
933 { 0x000101FF, { 0xD800, 0xDDFF } },
934 { 0x00010200, { 0xD800, 0xDE00 } },
935 { 0x00010201, { 0xD800, 0xDE01 } },
936 { 0x00010202, { 0xD800, 0xDE02 } },
937 { 0x00010204, { 0xD800, 0xDE04 } },
938 { 0x00010208, { 0xD800, 0xDE08 } },
939 { 0x00010210, { 0xD800, 0xDE10 } },
940 { 0x00010220, { 0xD800, 0xDE20 } },
941 { 0x00010240, { 0xD800, 0xDE40 } },
942 { 0x00010280, { 0xD800, 0xDE80 } },
943 { 0x00010300, { 0xD800, 0xDF00 } },
944 { 0x000103FF, { 0xD800, 0xDFFF } },
945 { 0x00010400, { 0xD801, 0xDC00 } },
946 { 0x00010401, { 0xD801, 0xDC01 } },
947 { 0x00010402, { 0xD801, 0xDC02 } },
948 { 0x00010404, { 0xD801, 0xDC04 } },
949 { 0x00010408, { 0xD801, 0xDC08 } },
950 { 0x00010410, { 0xD801, 0xDC10 } },
951 { 0x00010420, { 0xD801, 0xDC20 } },
952 { 0x00010440, { 0xD801, 0xDC40 } },
953 { 0x00010480, { 0xD801, 0xDC80 } },
954 { 0x00010500, { 0xD801, 0xDD00 } },
955 { 0x00010600, { 0xD801, 0xDE00 } },
956 { 0x000107FF, { 0xD801, 0xDFFF } },
957 { 0x00010800, { 0xD802, 0xDC00 } },
958 { 0x00010801, { 0xD802, 0xDC01 } },
959 { 0x00010802, { 0xD802, 0xDC02 } },
960 { 0x00010804, { 0xD802, 0xDC04 } },
961 { 0x00010808, { 0xD802, 0xDC08 } },
962 { 0x00010810, { 0xD802, 0xDC10 } },
963 { 0x00010820, { 0xD802, 0xDC20 } },
964 { 0x00010840, { 0xD802, 0xDC40 } },
965 { 0x00010880, { 0xD802, 0xDC80 } },
966 { 0x00010900, { 0xD802, 0xDD00 } },
967 { 0x00010A00, { 0xD802, 0xDE00 } },
968 { 0x00010C00, { 0xD803, 0xDC00 } },
969 { 0x00010FFF, { 0xD803, 0xDFFF } },
970 { 0x00011000, { 0xD804, 0xDC00 } },
971 { 0x00011001, { 0xD804, 0xDC01 } },
972 { 0x00011002, { 0xD804, 0xDC02 } },
973 { 0x00011004, { 0xD804, 0xDC04 } },
974 { 0x00011008, { 0xD804, 0xDC08 } },
975 { 0x00011010, { 0xD804, 0xDC10 } },
976 { 0x00011020, { 0xD804, 0xDC20 } },
977 { 0x00011040, { 0xD804, 0xDC40 } },
978 { 0x00011080, { 0xD804, 0xDC80 } },
979 { 0x00011100, { 0xD804, 0xDD00 } },
980 { 0x00011200, { 0xD804, 0xDE00 } },
981 { 0x00011400, { 0xD805, 0xDC00 } },
982 { 0x00011800, { 0xD806, 0xDC00 } },
983 { 0x00011FFF, { 0xD807, 0xDFFF } },
984 { 0x00012000, { 0xD808, 0xDC00 } },
985 { 0x00012001, { 0xD808, 0xDC01 } },
986 { 0x00012002, { 0xD808, 0xDC02 } },
987 { 0x00012004, { 0xD808, 0xDC04 } },
988 { 0x00012008, { 0xD808, 0xDC08 } },
989 { 0x00012010, { 0xD808, 0xDC10 } },
990 { 0x00012020, { 0xD808, 0xDC20 } },
991 { 0x00012040, { 0xD808, 0xDC40 } },
992 { 0x00012080, { 0xD808, 0xDC80 } },
993 { 0x00012100, { 0xD808, 0xDD00 } },
994 { 0x00012200, { 0xD808, 0xDE00 } },
995 { 0x00012400, { 0xD809, 0xDC00 } },
996 { 0x00012800, { 0xD80A, 0xDC00 } },
997 { 0x00013000, { 0xD80C, 0xDC00 } },
998 { 0x00013FFF, { 0xD80F, 0xDFFF } },
999 { 0x00014000, { 0xD810, 0xDC00 } },
1000 { 0x00014001, { 0xD810, 0xDC01 } },
1001 { 0x00014002, { 0xD810, 0xDC02 } },
1002 { 0x00014004, { 0xD810, 0xDC04 } },
1003 { 0x00014008, { 0xD810, 0xDC08 } },
1004 { 0x00014010, { 0xD810, 0xDC10 } },
1005 { 0x00014020, { 0xD810, 0xDC20 } },
1006 { 0x00014040, { 0xD810, 0xDC40 } },
1007 { 0x00014080, { 0xD810, 0xDC80 } },
1008 { 0x00014100, { 0xD810, 0xDD00 } },
1009 { 0x00014200, { 0xD810, 0xDE00 } },
1010 { 0x00014400, { 0xD811, 0xDC00 } },
1011 { 0x00014800, { 0xD812, 0xDC00 } },
1012 { 0x00015000, { 0xD814, 0xDC00 } },
1013 { 0x00016000, { 0xD818, 0xDC00 } },
1014 { 0x00017FFF, { 0xD81F, 0xDFFF } },
1015 { 0x00018000, { 0xD820, 0xDC00 } },
1016 { 0x00018001, { 0xD820, 0xDC01 } },
1017 { 0x00018002, { 0xD820, 0xDC02 } },
1018 { 0x00018004, { 0xD820, 0xDC04 } },
1019 { 0x00018008, { 0xD820, 0xDC08 } },
1020 { 0x00018010, { 0xD820, 0xDC10 } },
1021 { 0x00018020, { 0xD820, 0xDC20 } },
1022 { 0x00018040, { 0xD820, 0xDC40 } },
1023 { 0x00018080, { 0xD820, 0xDC80 } },
1024 { 0x00018100, { 0xD820, 0xDD00 } },
1025 { 0x00018200, { 0xD820, 0xDE00 } },
1026 { 0x00018400, { 0xD821, 0xDC00 } },
1027 { 0x00018800, { 0xD822, 0xDC00 } },
1028 { 0x00019000, { 0xD824, 0xDC00 } },
1029 { 0x0001A000, { 0xD828, 0xDC00 } },
1030 { 0x0001C000, { 0xD830, 0xDC00 } },
1031 { 0x0001FFFF, { 0xD83F, 0xDFFF } },
1032 { 0x00020000, { 0xD840, 0xDC00 } },
1033 { 0x00020001, { 0xD840, 0xDC01 } },
1034 { 0x00020002, { 0xD840, 0xDC02 } },
1035 { 0x00020004, { 0xD840, 0xDC04 } },
1036 { 0x00020008, { 0xD840, 0xDC08 } },
1037 { 0x00020010, { 0xD840, 0xDC10 } },
1038 { 0x00020020, { 0xD840, 0xDC20 } },
1039 { 0x00020040, { 0xD840, 0xDC40 } },
1040 { 0x00020080, { 0xD840, 0xDC80 } },
1041 { 0x00020100, { 0xD840, 0xDD00 } },
1042 { 0x00020200, { 0xD840, 0xDE00 } },
1043 { 0x00020400, { 0xD841, 0xDC00 } },
1044 { 0x00020800, { 0xD842, 0xDC00 } },
1045 { 0x00021000, { 0xD844, 0xDC00 } },
1046 { 0x00022000, { 0xD848, 0xDC00 } },
1047 { 0x00024000, { 0xD850, 0xDC00 } },
1048 { 0x00028000, { 0xD860, 0xDC00 } },
1049 { 0x0002FFFF, { 0xD87F, 0xDFFF } },
1050 { 0x00030000, { 0xD880, 0xDC00 } },
1051 { 0x00030001, { 0xD880, 0xDC01 } },
1052 { 0x00030002, { 0xD880, 0xDC02 } },
1053 { 0x00030004, { 0xD880, 0xDC04 } },
1054 { 0x00030008, { 0xD880, 0xDC08 } },
1055 { 0x00030010, { 0xD880, 0xDC10 } },
1056 { 0x00030020, { 0xD880, 0xDC20 } },
1057 { 0x00030040, { 0xD880, 0xDC40 } },
1058 { 0x00030080, { 0xD880, 0xDC80 } },
1059 { 0x00030100, { 0xD880, 0xDD00 } },
1060 { 0x00030200, { 0xD880, 0xDE00 } },
1061 { 0x00030400, { 0xD881, 0xDC00 } },
1062 { 0x00030800, { 0xD882, 0xDC00 } },
1063 { 0x00031000, { 0xD884, 0xDC00 } },
1064 { 0x00032000, { 0xD888, 0xDC00 } },
1065 { 0x00034000, { 0xD890, 0xDC00 } },
1066 { 0x00038000, { 0xD8A0, 0xDC00 } },
1067 { 0x0003FFFF, { 0xD8BF, 0xDFFF } },
1068 { 0x00040000, { 0xD8C0, 0xDC00 } },
1069 { 0x00040001, { 0xD8C0, 0xDC01 } },
1070 { 0x00040002, { 0xD8C0, 0xDC02 } },
1071 { 0x00040004, { 0xD8C0, 0xDC04 } },
1072 { 0x00040008, { 0xD8C0, 0xDC08 } },
1073 { 0x00040010, { 0xD8C0, 0xDC10 } },
1074 { 0x00040020, { 0xD8C0, 0xDC20 } },
1075 { 0x00040040, { 0xD8C0, 0xDC40 } },
1076 { 0x00040080, { 0xD8C0, 0xDC80 } },
1077 { 0x00040100, { 0xD8C0, 0xDD00 } },
1078 { 0x00040200, { 0xD8C0, 0xDE00 } },
1079 { 0x00040400, { 0xD8C1, 0xDC00 } },
1080 { 0x00040800, { 0xD8C2, 0xDC00 } },
1081 { 0x00041000, { 0xD8C4, 0xDC00 } },
1082 { 0x00042000, { 0xD8C8, 0xDC00 } },
1083 { 0x00044000, { 0xD8D0, 0xDC00 } },
1084 { 0x00048000, { 0xD8E0, 0xDC00 } },
1085 { 0x0004FFFF, { 0xD8FF, 0xDFFF } },
1086 { 0x00050000, { 0xD900, 0xDC00 } },
1087 { 0x00050001, { 0xD900, 0xDC01 } },
1088 { 0x00050002, { 0xD900, 0xDC02 } },
1089 { 0x00050004, { 0xD900, 0xDC04 } },
1090 { 0x00050008, { 0xD900, 0xDC08 } },
1091 { 0x00050010, { 0xD900, 0xDC10 } },
1092 { 0x00050020, { 0xD900, 0xDC20 } },
1093 { 0x00050040, { 0xD900, 0xDC40 } },
1094 { 0x00050080, { 0xD900, 0xDC80 } },
1095 { 0x00050100, { 0xD900, 0xDD00 } },
1096 { 0x00050200, { 0xD900, 0xDE00 } },
1097 { 0x00050400, { 0xD901, 0xDC00 } },
1098 { 0x00050800, { 0xD902, 0xDC00 } },
1099 { 0x00051000, { 0xD904, 0xDC00 } },
1100 { 0x00052000, { 0xD908, 0xDC00 } },
1101 { 0x00054000, { 0xD910, 0xDC00 } },
1102 { 0x00058000, { 0xD920, 0xDC00 } },
1103 { 0x00060000, { 0xD940, 0xDC00 } },
1104 { 0x00070000, { 0xD980, 0xDC00 } },
1105 { 0x0007FFFF, { 0xD9BF, 0xDFFF } },
1106 { 0x00080000, { 0xD9C0, 0xDC00 } },
1107 { 0x00080001, { 0xD9C0, 0xDC01 } },
1108 { 0x00080002, { 0xD9C0, 0xDC02 } },
1109 { 0x00080004, { 0xD9C0, 0xDC04 } },
1110 { 0x00080008, { 0xD9C0, 0xDC08 } },
1111 { 0x00080010, { 0xD9C0, 0xDC10 } },
1112 { 0x00080020, { 0xD9C0, 0xDC20 } },
1113 { 0x00080040, { 0xD9C0, 0xDC40 } },
1114 { 0x00080080, { 0xD9C0, 0xDC80 } },
1115 { 0x00080100, { 0xD9C0, 0xDD00 } },
1116 { 0x00080200, { 0xD9C0, 0xDE00 } },
1117 { 0x00080400, { 0xD9C1, 0xDC00 } },
1118 { 0x00080800, { 0xD9C2, 0xDC00 } },
1119 { 0x00081000, { 0xD9C4, 0xDC00 } },
1120 { 0x00082000, { 0xD9C8, 0xDC00 } },
1121 { 0x00084000, { 0xD9D0, 0xDC00 } },
1122 { 0x00088000, { 0xD9E0, 0xDC00 } },
1123 { 0x0008FFFF, { 0xD9FF, 0xDFFF } },
1124 { 0x00090000, { 0xDA00, 0xDC00 } },
1125 { 0x00090001, { 0xDA00, 0xDC01 } },
1126 { 0x00090002, { 0xDA00, 0xDC02 } },
1127 { 0x00090004, { 0xDA00, 0xDC04 } },
1128 { 0x00090008, { 0xDA00, 0xDC08 } },
1129 { 0x00090010, { 0xDA00, 0xDC10 } },
1130 { 0x00090020, { 0xDA00, 0xDC20 } },
1131 { 0x00090040, { 0xDA00, 0xDC40 } },
1132 { 0x00090080, { 0xDA00, 0xDC80 } },
1133 { 0x00090100, { 0xDA00, 0xDD00 } },
1134 { 0x00090200, { 0xDA00, 0xDE00 } },
1135 { 0x00090400, { 0xDA01, 0xDC00 } },
1136 { 0x00090800, { 0xDA02, 0xDC00 } },
1137 { 0x00091000, { 0xDA04, 0xDC00 } },
1138 { 0x00092000, { 0xDA08, 0xDC00 } },
1139 { 0x00094000, { 0xDA10, 0xDC00 } },
1140 { 0x00098000, { 0xDA20, 0xDC00 } },
1141 { 0x000A0000, { 0xDA40, 0xDC00 } },
1142 { 0x000B0000, { 0xDA80, 0xDC00 } },
1143 { 0x000C0000, { 0xDAC0, 0xDC00 } },
1144 { 0x000D0000, { 0xDB00, 0xDC00 } },
1145 { 0x000FFFFF, { 0xDBBF, 0xDFFF } },
1146 { 0x0010FFFF, { 0xDBFF, 0xDFFF } }
1150 /* illegal utf8 sequences */
1151 char *utf8_bad
[] = {
1160 "\xF8\x80\x80\x80\x80",
1161 "\xF8\x88\x80\x80\x80",
1162 "\xF8\x92\x80\x80\x80",
1163 "\xF8\x9F\xBF\xBF\xBF",
1164 "\xF8\xA0\x80\x80\x80",
1165 "\xF8\xA8\x80\x80\x80",
1166 "\xF8\xB0\x80\x80\x80",
1167 "\xF8\xBF\xBF\xBF\xBF",
1168 "\xF9\x80\x80\x80\x88",
1169 "\xF9\x84\x80\x80\x80",
1170 "\xF9\xBF\xBF\xBF\xBF",
1171 "\xFA\x80\x80\x80\x80",
1172 "\xFA\x90\x80\x80\x80",
1173 "\xFB\xBF\xBF\xBF\xBF",
1174 "\xFC\x84\x80\x80\x80\x81",
1175 "\xFC\x85\x80\x80\x80\x80",
1176 "\xFC\x86\x80\x80\x80\x80",
1177 "\xFC\x87\xBF\xBF\xBF\xBF",
1178 "\xFC\x88\xA0\x80\x80\x80",
1179 "\xFC\x89\x80\x80\x80\x80",
1180 "\xFC\x8A\x80\x80\x80\x80",
1181 "\xFC\x90\x80\x80\x80\x82",
1182 "\xFD\x80\x80\x80\x80\x80",
1183 "\xFD\xBF\xBF\xBF\xBF\xBF",
1190 "\xED\xA0\x80\xE0\xBF\xBF",
1197 unsigned char *utf8
,
1201 fprintf(stdout
, "%s ", word
);
1202 for( ; *utf8
; utf8
++ ) {
1203 fprintf(stdout
, "%02.2x ", (unsigned int)*utf8
);
1205 fprintf(stdout
, "%s", end
);
1214 PRBool rv
= PR_TRUE
;
1217 for( i
= 0; i
< sizeof(ucs4
)/sizeof(ucs4
[0]); i
++ ) {
1218 struct ucs4
*e
= &ucs4
[i
];
1220 unsigned char utf8
[8];
1221 unsigned int len
= 0;
1224 (void)memset(utf8
, 0, sizeof(utf8
));
1226 result
= sec_port_ucs4_utf8_conversion_function(PR_FALSE
,
1227 (unsigned char *)&e
->c
, sizeof(e
->c
), utf8
, sizeof(utf8
), &len
);
1230 fprintf(stdout
, "Failed to convert UCS-4 0x%08.8x to UTF-8\n", e
->c
);
1235 if( (len
>= sizeof(utf8
)) ||
1236 (strlen(e
->utf8
) != len
) ||
1237 (utf8
[len
] = '\0', 0 != strcmp(e
->utf8
, utf8
)) ) {
1238 fprintf(stdout
, "Wrong conversion of UCS-4 0x%08.8x to UTF-8: ", e
->c
);
1239 dump_utf8("expected", e
->utf8
, ", ");
1240 dump_utf8("received", utf8
, "\n");
1245 result
= sec_port_ucs4_utf8_conversion_function(PR_TRUE
,
1246 utf8
, len
, (unsigned char *)&back
, sizeof(back
), &len
);
1249 dump_utf8("Failed to convert UTF-8", utf8
, "to UCS-4\n");
1254 if( (sizeof(back
) != len
) || (e
->c
!= back
) ) {
1255 dump_utf8("Wrong conversion of UTF-8", utf8
, " to UCS-4:");
1256 fprintf(stdout
, "expected 0x%08.8x, received 0x%08.8x\n", e
->c
, back
);
1271 PRBool rv
= PR_TRUE
;
1274 for( i
= 0; i
< sizeof(ucs2
)/sizeof(ucs2
[0]); i
++ ) {
1275 struct ucs2
*e
= &ucs2
[i
];
1277 unsigned char utf8
[8];
1278 unsigned int len
= 0;
1281 (void)memset(utf8
, 0, sizeof(utf8
));
1283 result
= sec_port_ucs2_utf8_conversion_function(PR_FALSE
,
1284 (unsigned char *)&e
->c
, sizeof(e
->c
), utf8
, sizeof(utf8
), &len
);
1287 fprintf(stdout
, "Failed to convert UCS-2 0x%04.4x to UTF-8\n", e
->c
);
1292 if( (len
>= sizeof(utf8
)) ||
1293 (strlen(e
->utf8
) != len
) ||
1294 (utf8
[len
] = '\0', 0 != strcmp(e
->utf8
, utf8
)) ) {
1295 fprintf(stdout
, "Wrong conversion of UCS-2 0x%04.4x to UTF-8: ", e
->c
);
1296 dump_utf8("expected", e
->utf8
, ", ");
1297 dump_utf8("received", utf8
, "\n");
1302 result
= sec_port_ucs2_utf8_conversion_function(PR_TRUE
,
1303 utf8
, len
, (unsigned char *)&back
, sizeof(back
), &len
);
1306 dump_utf8("Failed to convert UTF-8", utf8
, "to UCS-2\n");
1311 if( (sizeof(back
) != len
) || (e
->c
!= back
) ) {
1312 dump_utf8("Wrong conversion of UTF-8", utf8
, "to UCS-2:");
1313 fprintf(stdout
, "expected 0x%08.8x, received 0x%08.8x\n", e
->c
, back
);
1328 PRBool rv
= PR_TRUE
;
1331 for( i
= 0; i
< sizeof(utf16
)/sizeof(utf16
[0]); i
++ ) {
1332 struct utf16
*e
= &utf16
[i
];
1334 unsigned char utf8
[8];
1335 unsigned int len
= 0;
1336 PRUint32 back32
= 0;
1339 (void)memset(utf8
, 0, sizeof(utf8
));
1341 result
= sec_port_ucs2_utf8_conversion_function(PR_FALSE
,
1342 (unsigned char *)&e
->w
[0], sizeof(e
->w
), utf8
, sizeof(utf8
), &len
);
1345 fprintf(stdout
, "Failed to convert UTF-16 0x%04.4x 0x%04.4x to UTF-8\n",
1351 result
= sec_port_ucs4_utf8_conversion_function(PR_TRUE
,
1352 utf8
, len
, (unsigned char *)&back32
, sizeof(back32
), &len
);
1355 fprintf(stdout
, "Failed to convert UTF-16 0x%04.4x 0x%04.4x to UTF-8: "
1356 "unexpected len %d\n", e
->w
[0], e
->w
[1], len
);
1361 utf8
[len
] = '\0'; /* null-terminate for printing */
1364 dump_utf8("Failed to convert UTF-8", utf8
, "to UCS-4 (utf-16 test)\n");
1369 if( (sizeof(back32
) != len
) || (e
->c
!= back32
) ) {
1370 fprintf(stdout
, "Wrong conversion of UTF-16 0x%04.4x 0x%04.4x ",
1372 dump_utf8("to UTF-8", utf8
, "and then to UCS-4: ");
1373 if( sizeof(back32
) != len
) {
1374 fprintf(stdout
, "len is %d\n", len
);
1376 fprintf(stdout
, "expected 0x%08.8x, received 0x%08.8x\n", e
->c
, back32
);
1382 (void)memset(utf8
, 0, sizeof(utf8
));
1383 back
[0] = back
[1] = 0;
1385 result
= sec_port_ucs4_utf8_conversion_function(PR_FALSE
,
1386 (unsigned char *)&e
->c
, sizeof(e
->c
), utf8
, sizeof(utf8
), &len
);
1389 fprintf(stdout
, "Failed to convert UCS-4 0x%08.8x to UTF-8 (utf-16 test)\n",
1395 result
= sec_port_ucs2_utf8_conversion_function(PR_TRUE
,
1396 utf8
, len
, (unsigned char *)&back
[0], sizeof(back
), &len
);
1399 fprintf(stdout
, "Failed to convert UCS-4 0x%08.8x to UTF-8: "
1400 "unexpected len %d\n", e
->c
, len
);
1405 utf8
[len
] = '\0'; /* null-terminate for printing */
1408 dump_utf8("Failed to convert UTF-8", utf8
, "to UTF-16\n");
1413 if( (sizeof(back
) != len
) || (e
->w
[0] != back
[0]) || (e
->w
[1] != back
[1]) ) {
1414 fprintf(stdout
, "Wrong conversion of UCS-4 0x%08.8x to UTF-8", e
->c
);
1415 dump_utf8("", utf8
, "and then to UTF-16:");
1416 if( sizeof(back
) != len
) {
1417 fprintf(stdout
, "len is %d\n", len
);
1419 fprintf(stdout
, "expected 0x%04.4x 0x%04.4x, received 0x%04.4x 0x%04.4xx\n",
1420 e
->w
[0], e
->w
[1], back
[0], back
[1]);
1436 PRBool rv
= PR_TRUE
;
1439 for( i
= 0; i
< sizeof(utf8_bad
)/sizeof(utf8_bad
[0]); i
++ ) {
1441 unsigned char destbuf
[30];
1442 unsigned int len
= 0;
1444 result
= sec_port_ucs2_utf8_conversion_function(PR_TRUE
,
1445 (unsigned char *)utf8_bad
[i
], strlen(utf8_bad
[i
]), destbuf
, sizeof(destbuf
), &len
);
1448 dump_utf8("Failed to detect bad UTF-8 string converting to UCS2: ", utf8_bad
[i
], "\n");
1452 result
= sec_port_ucs4_utf8_conversion_function(PR_TRUE
,
1453 (unsigned char *)utf8_bad
[i
], strlen(utf8_bad
[i
]), destbuf
, sizeof(destbuf
), &len
);
1456 dump_utf8("Failed to detect bad UTF-8 string converting to UCS4: ", utf8_bad
[i
], "\n");
1472 PRBool rv
= PR_TRUE
;
1475 for( i
= 0; i
< sizeof(ucs2
)/sizeof(ucs2
[0]); i
++ ) {
1476 struct ucs2
*e
= &ucs2
[i
];
1478 unsigned char iso88591
;
1479 unsigned char utf8
[3];
1480 unsigned int len
= 0;
1482 if (ntohs(e
->c
) > 0xFF) continue;
1484 (void)memset(utf8
, 0, sizeof(utf8
));
1485 iso88591
= ntohs(e
->c
);
1487 result
= sec_port_iso88591_utf8_conversion_function(&iso88591
,
1488 1, utf8
, sizeof(utf8
), &len
);
1491 fprintf(stdout
, "Failed to convert ISO-8859-1 0x%02.2x to UTF-8\n", iso88591
);
1496 if( (len
>= sizeof(utf8
)) ||
1497 (strlen(e
->utf8
) != len
) ||
1498 (utf8
[len
] = '\0', 0 != strcmp(e
->utf8
, utf8
)) ) {
1499 fprintf(stdout
, "Wrong conversion of ISO-8859-1 0x%02.2x to UTF-8: ", iso88591
);
1500 dump_utf8("expected", e
->utf8
, ", ");
1501 dump_utf8("received", utf8
, "\n");
1517 PRBool rv
= PR_TRUE
;
1521 unsigned char utf8
[8];
1522 unsigned int len
= 0;
1526 (void)memset(utf8
, 1, sizeof(utf8
));
1528 result
= sec_port_ucs4_utf8_conversion_function(PR_FALSE
,
1529 (unsigned char *)&lzero
, sizeof(lzero
), utf8
, sizeof(utf8
), &len
);
1532 fprintf(stdout
, "Failed to convert UCS-4 0x00000000 to UTF-8\n");
1534 } else if( 1 != len
) {
1535 fprintf(stdout
, "Wrong conversion of UCS-4 0x00000000: len = %d\n", len
);
1537 } else if( '\0' != *utf8
) {
1538 fprintf(stdout
, "Wrong conversion of UCS-4 0x00000000: expected 00 ,"
1539 "received %02.2x\n", (unsigned int)*utf8
);
1543 result
= sec_port_ucs4_utf8_conversion_function(PR_TRUE
,
1544 "", 1, (unsigned char *)&lback
, sizeof(lback
), &len
);
1547 fprintf(stdout
, "Failed to convert UTF-8 00 to UCS-4\n");
1549 } else if( 4 != len
) {
1550 fprintf(stdout
, "Wrong conversion of UTF-8 00 to UCS-4: len = %d\n", len
);
1552 } else if( 0 != lback
) {
1553 fprintf(stdout
, "Wrong conversion of UTF-8 00 to UCS-4: "
1554 "expected 0x00000000, received 0x%08.8x\n", lback
);
1558 (void)memset(utf8
, 1, sizeof(utf8
));
1560 result
= sec_port_ucs2_utf8_conversion_function(PR_FALSE
,
1561 (unsigned char *)&szero
, sizeof(szero
), utf8
, sizeof(utf8
), &len
);
1564 fprintf(stdout
, "Failed to convert UCS-2 0x0000 to UTF-8\n");
1566 } else if( 1 != len
) {
1567 fprintf(stdout
, "Wrong conversion of UCS-2 0x0000: len = %d\n", len
);
1569 } else if( '\0' != *utf8
) {
1570 fprintf(stdout
, "Wrong conversion of UCS-2 0x0000: expected 00 ,"
1571 "received %02.2x\n", (unsigned int)*utf8
);
1575 result
= sec_port_ucs2_utf8_conversion_function(PR_TRUE
,
1576 "", 1, (unsigned char *)&sback
, sizeof(sback
), &len
);
1579 fprintf(stdout
, "Failed to convert UTF-8 00 to UCS-2\n");
1581 } else if( 2 != len
) {
1582 fprintf(stdout
, "Wrong conversion of UTF-8 00 to UCS-2: len = %d\n", len
);
1584 } else if( 0 != sback
) {
1585 fprintf(stdout
, "Wrong conversion of UTF-8 00 to UCS-2: "
1586 "expected 0x0000, received 0x%04.4x\n", sback
);
1600 unsigned int len
, lenout
;
1608 ucs4s
= (PRUint32
*)calloc(sizeof(ucs4
)/sizeof(ucs4
[0]), sizeof(PRUint32
));
1609 ucs2s
= (PRUint16
*)calloc(sizeof(ucs2
)/sizeof(ucs2
[0]), sizeof(PRUint16
));
1611 if( ((PRUint32
*)NULL
== ucs4s
) || ((PRUint16
*)NULL
== ucs2s
) ) {
1612 fprintf(stderr
, "out of memory\n");
1617 for( i
= 0; i
< sizeof(ucs4
)/sizeof(ucs4
[0]); i
++ ) {
1618 ucs4s
[i
] = ucs4
[i
].c
;
1619 len
+= strlen(ucs4
[i
].utf8
);
1622 ucs4_utf8
= (char *)malloc(len
);
1625 for( i
= 0; i
< sizeof(ucs2
)/sizeof(ucs2
[0]); i
++ ) {
1626 ucs2s
[i
] = ucs2
[i
].c
;
1627 len
+= strlen(ucs2
[i
].utf8
);
1630 ucs2_utf8
= (char *)malloc(len
);
1632 if( ((char *)NULL
== ucs4_utf8
) || ((char *)NULL
== ucs2_utf8
) ) {
1633 fprintf(stderr
, "out of memory\n");
1638 for( i
= 0; i
< sizeof(ucs4
)/sizeof(ucs4
[0]); i
++ ) {
1639 strcat(ucs4_utf8
, ucs4
[i
].utf8
);
1643 for( i
= 0; i
< sizeof(ucs2
)/sizeof(ucs2
[0]); i
++ ) {
1644 strcat(ucs2_utf8
, ucs2
[i
].utf8
);
1647 /* UTF-8 -> UCS-4 */
1648 len
= sizeof(ucs4
)/sizeof(ucs4
[0]) * sizeof(PRUint32
);
1649 tmp
= calloc(len
, 1);
1650 if( (void *)NULL
== tmp
) {
1651 fprintf(stderr
, "out of memory\n");
1655 result
= sec_port_ucs4_utf8_conversion_function(PR_TRUE
,
1656 ucs4_utf8
, strlen(ucs4_utf8
), tmp
, len
, &lenout
);
1658 fprintf(stdout
, "Failed to convert much UTF-8 to UCS-4\n");
1662 if( lenout
!= len
) {
1663 fprintf(stdout
, "Unexpected length converting much UTF-8 to UCS-4\n");
1667 if( 0 != memcmp(ucs4s
, tmp
, len
) ) {
1668 fprintf(stdout
, "Wrong conversion of much UTF-8 to UCS-4\n");
1672 free(tmp
); tmp
= (void *)NULL
;
1674 /* UCS-4 -> UTF-8 */
1675 len
= strlen(ucs4_utf8
);
1676 tmp
= calloc(len
, 1);
1677 if( (void *)NULL
== tmp
) {
1678 fprintf(stderr
, "out of memory\n");
1682 result
= sec_port_ucs4_utf8_conversion_function(PR_FALSE
,
1683 (unsigned char *)ucs4s
, sizeof(ucs4
)/sizeof(ucs4
[0]) * sizeof(PRUint32
),
1686 fprintf(stdout
, "Failed to convert much UCS-4 to UTF-8\n");
1690 if( lenout
!= len
) {
1691 fprintf(stdout
, "Unexpected length converting much UCS-4 to UTF-8\n");
1695 if( 0 != strncmp(ucs4_utf8
, tmp
, len
) ) {
1696 fprintf(stdout
, "Wrong conversion of much UCS-4 to UTF-8\n");
1700 free(tmp
); tmp
= (void *)NULL
;
1702 /* UTF-8 -> UCS-2 */
1703 len
= sizeof(ucs2
)/sizeof(ucs2
[0]) * sizeof(PRUint16
);
1704 tmp
= calloc(len
, 1);
1705 if( (void *)NULL
== tmp
) {
1706 fprintf(stderr
, "out of memory\n");
1710 result
= sec_port_ucs2_utf8_conversion_function(PR_TRUE
,
1711 ucs2_utf8
, strlen(ucs2_utf8
), tmp
, len
, &lenout
);
1713 fprintf(stdout
, "Failed to convert much UTF-8 to UCS-2\n");
1717 if( lenout
!= len
) {
1718 fprintf(stdout
, "Unexpected length converting much UTF-8 to UCS-2\n");
1722 if( 0 != memcmp(ucs2s
, tmp
, len
) ) {
1723 fprintf(stdout
, "Wrong conversion of much UTF-8 to UCS-2\n");
1727 free(tmp
); tmp
= (void *)NULL
;
1729 /* UCS-2 -> UTF-8 */
1730 len
= strlen(ucs2_utf8
);
1731 tmp
= calloc(len
, 1);
1732 if( (void *)NULL
== tmp
) {
1733 fprintf(stderr
, "out of memory\n");
1737 result
= sec_port_ucs2_utf8_conversion_function(PR_FALSE
,
1738 (unsigned char *)ucs2s
, sizeof(ucs2
)/sizeof(ucs2
[0]) * sizeof(PRUint16
),
1741 fprintf(stdout
, "Failed to convert much UCS-2 to UTF-8\n");
1745 if( lenout
!= len
) {
1746 fprintf(stdout
, "Unexpected length converting much UCS-2 to UTF-8\n");
1750 if( 0 != strncmp(ucs2_utf8
, tmp
, len
) ) {
1751 fprintf(stdout
, "Wrong conversion of much UCS-2 to UTF-8\n");
1755 /* implement UTF16 */
1767 if( (void *)NULL
!= tmp
) free(tmp
);
1778 * The implementation (now) expects the 16- and 32-bit characters
1779 * to be in network byte order, not host byte order. Therefore I
1780 * have to byteswap all those test vectors above. hton[ls] may be
1781 * functions, so I have to do this dynamically. If you want to
1782 * use this code to do host byte order conversions, just remove
1783 * the call in main() to this function.
1788 for( i
= 0; i
< sizeof(ucs4
)/sizeof(ucs4
[0]); i
++ ) {
1789 struct ucs4
*e
= &ucs4
[i
];
1793 for( i
= 0; i
< sizeof(ucs2
)/sizeof(ucs2
[0]); i
++ ) {
1794 struct ucs2
*e
= &ucs2
[i
];
1798 for( i
= 0; i
< sizeof(utf16
)/sizeof(utf16
[0]); i
++ ) {
1799 struct utf16
*e
= &utf16
[i
];
1801 e
->w
[0] = htons(e
->w
[0]);
1802 e
->w
[1] = htons(e
->w
[1]);
1817 if( test_ucs4_chars() &&
1818 test_ucs2_chars() &&
1819 test_utf16_chars() &&
1820 test_utf8_bad_chars() &&
1821 test_iso88591_chars() &&
1823 test_multichars() &&
1825 fprintf(stderr
, "PASS\n");
1828 fprintf(stderr
, "FAIL\n");
1833 #endif /* TEST_UTF8 */