1 /* Copyright (C) 1999-2001 Free Software Foundation, Inc.
2 This file is part of the GNU LIBICONV Tools.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 * Generates a CJK character set table from a .TXT table as found on
20 * ftp.unicode.org or in the X nls directory.
23 * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
24 * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
25 * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
27 * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
28 * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
29 * ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
30 * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
31 * ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
33 * ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
35 * ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
49 int rows
; /* number of possible values for the 1st byte */
50 int cols
; /* number of possible values for the 2nd byte */
51 int (*row_byte
) (int row
); /* returns the 1st byte value for a given row */
52 int (*col_byte
) (int col
); /* returns the 2nd byte value for a given col */
53 int (*byte_row
) (int byte
); /* converts a 1st byte value to a row, else -1 */
54 int (*byte_col
) (int byte
); /* converts a 2nd byte value to a col, else -1 */
55 const char* check_row_expr
; /* printf format for the generated 1st byte range check; each %1$s
     is replaced by the C expression that names the byte */
56 const char* check_col_expr
; /* printf format for the generated 2nd byte range check; each %1$s
     is replaced by the C expression that names the byte */
57 const char* byte_row_expr
; /* printf format for the generated 1st byte value to row conversion */
58 const char* byte_col_expr
; /* printf format for the generated 2nd byte value to col conversion */
59 int** charset2uni
; /* charset2uni[0..rows-1][0..cols-1] is valid; cells without a
     mapping are initialised to 0xfffd by the table readers */
60 /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
61 Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
62 int* charsetpage
; /* charsetpage[0..rows]: how large is a page for a row; the extra
     entry charsetpage[rows] is left at 0 and ends the
     "charsetpage[lastrow+1] > 0" scans */
64 Block
* charsetblocks
; /* charsetblocks[0..ncharsetblocks-1] */
65 int* uni2charset
; /* uni2charset[0x0000..0x2ffff], allocated and filled by invert() */
66 int fffd
; /* value that represents the invalid character inside charset2uni:
     0xfffd, or its remapped form after compact_large_charset2uni() */
70 * Outputs the file title.
/* Prints a GNU LIBICONV Library licence notice on standard output as
   " * "-prefixed lines, followed by one line carrying charsetname.
   NOTE(review): the statements that open and close the emitted comment
   are not visible in this excerpt. */
72 static void output_title (const char *charsetname
)
75 printf(" * Copyright (C) 1999-2001 Free Software Foundation, Inc.\n");
76 printf(" * This file is part of the GNU LIBICONV Library.\n");
78 printf(" * The GNU LIBICONV Library is free software; you can redistribute it\n");
79 printf(" * and/or modify it under the terms of the GNU Library General Public\n");
80 printf(" * License as published by the Free Software Foundation; either version 2\n");
81 printf(" * of the License, or (at your option) any later version.\n");
83 printf(" * The GNU LIBICONV Library is distributed in the hope that it will be\n");
84 printf(" * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
85 printf(" * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n");
86 printf(" * Library General Public License for more details.\n");
88 printf(" * You should have received a copy of the GNU Library General Public\n");
89 printf(" * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n");
90 printf(" * If not, write to the Free Software Foundation, Inc., 59 Temple Place -\n");
91 printf(" * Suite 330, Boston, MA 02111-1307, USA.\n");
/* The character set name is printed as its own " * " line. */
95 printf(" * %s\n", charsetname
);
101 * Reads the charset2uni table from standard input.
103 static void read_table (Encoding
* enc
)
105 int row
, col
, i
, i1
, i2
, c
, j
;
107 enc
->charset2uni
= (int**) malloc(enc
->rows
*sizeof(int*));
108 for (row
= 0; row
< enc
->rows
; row
++)
109 enc
->charset2uni
[row
] = (int*) malloc(enc
->cols
*sizeof(int));
111 for (row
= 0; row
< enc
->rows
; row
++)
112 for (col
= 0; col
< enc
->cols
; col
++)
113 enc
->charset2uni
[row
][col
] = 0xfffd;
118 /* Read a unicode.org style .TXT file. */
123 if (c
== '\n' || c
== ' ' || c
== '\t')
126 do { c
= getc(stdin
); } while (!(c
== EOF
|| c
== '\n'));
130 if (scanf("0x%x", &j
) != 1)
134 row
= enc
->byte_row(i1
);
135 col
= enc
->byte_col(i2
);
136 if (row
< 0 || col
< 0) {
137 fprintf(stderr
, "lost entry for %02x %02x\n", i1
, i2
);
140 if (scanf(" 0x%x", &enc
->charset2uni
[row
][col
]) != 1)
144 /* Read a table of hexadecimal Unicode values. */
145 for (i1
= 32; i1
< 132; i1
++)
146 for (i2
= 32; i2
< 132; i2
++) {
152 if (j
< 0 || j
== 0xffff)
155 if (enc
->byte_row(i1
) < 0 || enc
->byte_col(i2
) < 0) {
156 fprintf(stderr
, "lost entry at %02x %02x\n", i1
, i2
);
159 enc
->charset2uni
[enc
->byte_row(i1
)][enc
->byte_col(i2
)] = j
;
167 * Determine whether the Unicode range goes outside the BMP.
/* Scans every cell of enc->charset2uni[0..rows-1][0..cols-1] for a value
   of 0x10000 or more, i.e. one that does not fit in 16 bits.
   NOTE(review): the return statements are not visible in this excerpt;
   presumably true is returned as soon as such a cell is found and false
   otherwise - confirm against the full file. */
169 static bool is_charset2uni_large (Encoding
* enc
)
173 for (row
= 0; row
< enc
->rows
; row
++)
174 for (col
= 0; col
< enc
->cols
; col
++)
/* 0x10000 is the first code point beyond the 16-bit range. */
175 if (enc
->charset2uni
[row
][col
] >= 0x10000)
181 * Compactify the Unicode range by use of an auxiliary table,
182 * so 16 bits suffice to store each value.
184 static int compact_large_charset2uni (Encoding
* enc
, unsigned int **urows
)
187 int i
, row
, col
, nurows
;
189 for (i
= 0; i
< 0x1100; i
++)
192 for (row
= 0; row
< enc
->rows
; row
++)
193 for (col
= 0; col
< enc
->cols
; col
++)
194 upages
[enc
->charset2uni
[row
][col
] >> 8] = 0;
197 for (i
= 0; i
< 0x1100; i
++)
201 *urows
= (unsigned int *) malloc(nurows
* sizeof(unsigned int));
204 for (i
= 0; i
< 0x1100; i
++)
205 if (upages
[i
] == 0) {
207 (*urows
)[nurows
] = i
;
211 for (row
= 0; row
< enc
->rows
; row
++)
212 for (col
= 0; col
< enc
->cols
; col
++) {
213 int u
= enc
->charset2uni
[row
][col
];
214 enc
->charset2uni
[row
][col
] = (upages
[u
>> 8] << 8) | (u
& 0xFF);
216 enc
->fffd
= (upages
[0xfffd >> 8] << 8) | (0xfffd & 0xFF);
222 * Computes the charsetpage[0..rows] array.
/* Allocates enc->charsetpage with rows+1 entries, clears all of them, and
   then records a per-row value `used` derived from the cells that differ
   from enc->fffd.  The result of malloc is not checked.
   NOTE(review): the statements that compute `used` are not visible in
   this excerpt; presumably it is one past the last mapped column of the
   row - confirm against the full file. */
224 static void find_charset2uni_pages (Encoding
* enc
)
/* One entry more than there are rows; the extra entry is only cleared. */
228 enc
->charsetpage
= (int*) malloc((enc
->rows
+1)*sizeof(int));
/* Note the "<=": entry [rows] is cleared as well. */
230 for (row
= 0; row
<= enc
->rows
; row
++)
231 enc
->charsetpage
[row
] = 0;
233 for (row
= 0; row
< enc
->rows
; row
++) {
235 for (col
= 0; col
< enc
->cols
; col
++)
/* enc->fffd marks a cell without a mapping. */
236 if (enc
->charset2uni
[row
][col
] != enc
->fffd
)
238 enc
->charsetpage
[row
] = used
;
243 * Fills in ncharsetblocks and charsetblocks.
/* Groups consecutive rows with charsetpage[row] > 0 into blocks and stores
   them in enc->charsetblocks; the block count goes to enc->ncharsetblocks.
   Block bounds are linear cell indices (row * cols + col): `start` is the
   first cell of the first row, `end` is the first cell of the last row
   plus that row's charsetpage value.
   Reads enc->charsetpage, so find_charset2uni_pages() must have run first.
   The result of malloc is not checked.
   NOTE(review): the initialisation and increment of `n` are not visible
   in this excerpt. */
245 static void find_charset2uni_blocks (Encoding
* enc
)
/* Room for one block per row. */
249 enc
->charsetblocks
= (Block
*) malloc(enc
->rows
*sizeof(Block
));
252 for (row
= 0; row
< enc
->rows
; row
++)
/* A block starts at a used row whose predecessor is unused (or absent). */
253 if (enc
->charsetpage
[row
] > 0 && (row
== 0 || enc
->charsetpage
[row
-1] == 0)) {
/* Advance to the last used row of the run; charsetpage[rows] is 0, so
   the scan stops at the end of the table at the latest. */
254 for (lastrow
= row
; enc
->charsetpage
[lastrow
+1] > 0; lastrow
++);
255 enc
->charsetblocks
[n
].start
= row
* enc
->cols
;
256 enc
->charsetblocks
[n
].end
= lastrow
* enc
->cols
+ enc
->charsetpage
[lastrow
];
259 enc
->ncharsetblocks
= n
;
263 * Outputs the charset to unicode table and function.
265 static void output_charset2uni (const char* name
, Encoding
* enc
)
267 int nurows
, row
, col
, lastrow
, col_max
, i
, i1_min
, i1_max
;
271 is_large
= is_charset2uni_large(enc
);
273 nurows
= compact_large_charset2uni(enc
,&urows
);
275 nurows
= 0; urows
= NULL
; enc
->fffd
= 0xfffd;
278 find_charset2uni_pages(enc
);
280 find_charset2uni_blocks(enc
);
282 for (row
= 0; row
< enc
->rows
; row
++)
283 if (enc
->charsetpage
[row
] > 0) {
284 if (row
== 0 || enc
->charsetpage
[row
-1] == 0) {
285 /* Start a new block. */
286 for (lastrow
= row
; enc
->charsetpage
[lastrow
+1] > 0; lastrow
++);
287 printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
288 name
, enc
->row_byte(row
),
289 (lastrow
-row
) * enc
->cols
+ enc
->charsetpage
[lastrow
]);
291 printf(" /""* 0x%02x *""/\n ", enc
->row_byte(row
));
292 col_max
= (enc
->charsetpage
[row
+1] > 0 ? enc
->cols
: enc
->charsetpage
[row
]);
293 for (col
= 0; col
< col_max
; col
++) {
294 printf(" 0x%04x,", enc
->charset2uni
[row
][col
]);
295 if ((col
% 8) == 7 && (col
+1 < col_max
)) printf("\n ");
298 if (enc
->charsetpage
[row
+1] == 0) {
306 printf("static const ucs4_t %s_2uni_upages[%d] = {\n ", name
, nurows
);
307 for (i
= 0; i
< nurows
; i
++) {
308 printf(" 0x%05x,", urows
[i
] << 8);
309 if ((i
% 8) == 7 && (i
+1 < nurows
)) printf("\n ");
316 printf("static int\n");
317 printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name
);
319 printf(" unsigned char c1 = s[0];\n");
321 for (i
= 0; i
< enc
->ncharsetblocks
; i
++) {
322 i1_min
= enc
->row_byte(enc
->charsetblocks
[i
].start
/ enc
->cols
);
323 i1_max
= enc
->row_byte((enc
->charsetblocks
[i
].end
-1) / enc
->cols
);
326 if (i1_min
== i1_max
)
327 printf("(c1 == 0x%02x)", i1_min
);
329 printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min
, i1_max
);
332 printf(" if (n >= 2) {\n");
333 printf(" unsigned char c2 = s[1];\n");
335 printf(enc
->check_col_expr
, "c2");
337 printf(" unsigned int i = %d * (", enc
->cols
);
338 printf(enc
->byte_row_expr
, "c1");
340 printf(enc
->byte_col_expr
, "c2");
342 printf(" %s wc = 0xfffd;\n", is_large
? "ucs4_t" : "unsigned short");
343 if (is_large
) printf(" unsigned short swc;\n");
344 for (i
= 0; i
< enc
->ncharsetblocks
; i
++) {
348 if (i
< enc
->ncharsetblocks
-1)
349 printf("if (i < %d) ", enc
->charsetblocks
[i
+1].start
);
351 printf(" if (i < %d)\n", enc
->charsetblocks
[i
].end
);
352 printf(" %s = ", is_large
? "swc" : "wc");
353 printf("%s_2uni_page%02x[i", name
, enc
->row_byte(enc
->charsetblocks
[i
].start
/ enc
->cols
));
354 if (enc
->charsetblocks
[i
].start
> 0)
355 printf("-%d", enc
->charsetblocks
[i
].start
);
357 if (is_large
) printf(",\n wc = %s_2uni_upages[swc>>8] | (swc & 0xff)", name
);
361 printf(" if (wc != 0xfffd) {\n");
362 printf(" *pwc = %swc;\n", is_large
? "" : "(ucs4_t) ");
363 printf(" return 2;\n");
366 printf(" return RET_ILSEQ;\n");
368 printf(" return RET_TOOFEW(0);\n");
370 printf(" return RET_ILSEQ;\n");
376 * Outputs the charset to unicode table and function.
377 * (Suitable if the mapping function is well defined, i.e. has no holes, and
378 * is monotonically increasing with small gaps only.)
380 static void output_charset2uni_noholes_monotonic (const char* name
, Encoding
* enc
)
382 int row
, col
, lastrow
, r
, col_max
, i
, i1_min
, i1_max
;
384 /* Choose stepsize so that stepsize*steps_per_row >= enc->cols, and
385 enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]
386 is always < 0x100. */
387 int steps_per_row
= 2;
388 int stepsize
= (enc
->cols
+ steps_per_row
-1) / steps_per_row
;
390 find_charset2uni_pages(enc
);
392 find_charset2uni_blocks(enc
);
394 for (row
= 0; row
< enc
->rows
; row
++)
395 if (enc
->charsetpage
[row
] > 0) {
396 if (row
== 0 || enc
->charsetpage
[row
-1] == 0) {
397 /* Start a new block. */
398 for (lastrow
= row
; enc
->charsetpage
[lastrow
+1] > 0; lastrow
++);
399 printf("static const unsigned short %s_2uni_main_page%02x[%d] = {\n ",
400 name
, enc
->row_byte(row
),
401 steps_per_row
*(lastrow
-row
+1));
402 for (r
= row
; r
<= lastrow
; r
++) {
403 for (i
= 0; i
< steps_per_row
; i
++)
404 printf(" 0x%04x,", enc
->charset2uni
[r
][i
*stepsize
]);
405 if (((r
-row
) % 4) == 3 && (r
< lastrow
)) printf("\n ");
409 printf("static const unsigned char %s_2uni_page%02x[%d] = {\n",
410 name
, enc
->row_byte(row
),
411 (lastrow
-row
) * enc
->cols
+ enc
->charsetpage
[lastrow
]);
413 printf(" /""* 0x%02x *""/\n ", enc
->row_byte(row
));
414 col_max
= (enc
->charsetpage
[row
+1] > 0 ? enc
->cols
: enc
->charsetpage
[row
]);
415 for (col
= 0; col
< col_max
; col
++) {
416 printf(" 0x%02x,", enc
->charset2uni
[row
][col
] - enc
->charset2uni
[row
][col
/stepsize
*stepsize
]);
417 if ((col
% 8) == 7 && (col
+1 < col_max
)) printf("\n ");
420 if (enc
->charsetpage
[row
+1] == 0) {
427 printf("static int\n");
428 printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name
);
430 printf(" unsigned char c1 = s[0];\n");
432 for (i
= 0; i
< enc
->ncharsetblocks
; i
++) {
433 i1_min
= enc
->row_byte(enc
->charsetblocks
[i
].start
/ enc
->cols
);
434 i1_max
= enc
->row_byte((enc
->charsetblocks
[i
].end
-1) / enc
->cols
);
437 if (i1_min
== i1_max
)
438 printf("(c1 == 0x%02x)", i1_min
);
440 printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min
, i1_max
);
443 printf(" if (n >= 2) {\n");
444 printf(" unsigned char c2 = s[1];\n");
446 printf(enc
->check_col_expr
, "c2");
448 printf(" unsigned int row = ");
449 printf(enc
->byte_row_expr
, "c1");
451 printf(" unsigned int col = ");
452 printf(enc
->byte_col_expr
, "c2");
454 printf(" unsigned int i = %d * row + col;\n", enc
->cols
);
455 printf(" unsigned short wc = 0xfffd;\n");
456 for (i
= 0; i
< enc
->ncharsetblocks
; i
++) {
460 if (i
< enc
->ncharsetblocks
-1)
461 printf("if (i < %d) ", enc
->charsetblocks
[i
+1].start
);
463 printf(" if (i < %d)\n", enc
->charsetblocks
[i
].end
);
464 printf(" wc = %s_2uni_main_page%02x[%d*", name
, enc
->row_byte(enc
->charsetblocks
[i
].start
/ enc
->cols
), steps_per_row
);
465 if (enc
->charsetblocks
[i
].start
> 0)
466 printf("(row-%d)", enc
->charsetblocks
[i
].start
/ enc
->cols
);
470 if (steps_per_row
== 2)
471 printf("(col>=%d?1:0)", stepsize
);
473 printf("col/%d", stepsize
);
474 printf("] + %s_2uni_page%02x[i", name
, enc
->row_byte(enc
->charsetblocks
[i
].start
/ enc
->cols
));
475 if (enc
->charsetblocks
[i
].start
> 0)
476 printf("-%d", enc
->charsetblocks
[i
].start
);
480 printf(" if (wc != 0xfffd) {\n");
481 printf(" *pwc = (ucs4_t) wc;\n");
482 printf(" return 2;\n");
485 printf(" return RET_ILSEQ;\n");
487 printf(" return RET_TOOFEW(0);\n");
489 printf(" return RET_ILSEQ;\n");
495 * Computes the uni2charset[0x0000..0x2ffff] array.
/* Builds the reverse mapping enc->uni2charset from enc->charset2uni.
   Allocates 0x30000 ints, clears them (0 means "no mapping"), then for
   each (row, col) stores 0x100 * row_byte(row) + col_byte(col) at the
   index given by the cell's Unicode value.  The result of malloc is not
   checked.
   NOTE(review): one statement between reading j and the store is not
   visible in this excerpt (presumably a guard that skips unmapped cells);
   nothing visible here checks that j is below 0x30000. */
497 static void invert (Encoding
* enc
)
501 enc
->uni2charset
= (int*) malloc(0x30000*sizeof(int));
503 for (j
= 0; j
< 0x30000; j
++)
504 enc
->uni2charset
[j
] = 0;
506 for (row
= 0; row
< enc
->rows
; row
++)
507 for (col
= 0; col
< enc
->cols
; col
++) {
508 j
= enc
->charset2uni
[row
][col
];
/* 1st byte goes in the high part, 2nd byte in the low 8 bits. */
510 enc
->uni2charset
[j
] = 0x100 * enc
->row_byte(row
) + enc
->col_byte(col
);
515 * Outputs the unicode to charset table and function, using a linear array.
516 * (Suitable if the table is dense.)
518 static void output_uni2charset_dense (const char* name
, Encoding
* enc
)
520 /* Like in 8bit_tab_to_h.c */
524 struct { int minline
; int maxline
; int usecount
; } tables
[0x6000];
526 int row
, col
, j
, p
, j1
, j2
, t
;
528 for (p
= 0; p
< 0x300; p
++)
530 for (row
= 0; row
< enc
->rows
; row
++)
531 for (col
= 0; col
< enc
->cols
; col
++) {
532 j
= enc
->charset2uni
[row
][col
];
536 for (j1
= 0; j1
< 0x6000; j1
++) {
537 bool all_invalid
= true;
538 for (j2
= 0; j2
< 8; j2
++) {
540 if (enc
->uni2charset
[j
] != 0)
549 for (j1
= 0; j1
< 0x6000; j1
++) {
552 && ((j1
> 0 && line
[j1
-1] == tableno
-1)
553 || ((tables
[tableno
-1].maxline
>> 5) == (j1
>> 5)
554 && j1
- tables
[tableno
-1].maxline
<= 8))) {
555 line
[j1
] = tableno
-1;
556 tables
[tableno
-1].maxline
= j1
;
559 line
[j1
] = tableno
-1;
560 tables
[tableno
-1].minline
= tables
[tableno
-1].maxline
= j1
;
564 for (t
= 0; t
< tableno
; t
++) {
565 tables
[t
].usecount
= 0;
566 j1
= 8*tables
[t
].minline
;
567 j2
= 8*(tables
[t
].maxline
+1);
568 for (j
= j1
; j
< j2
; j
++)
569 if (enc
->uni2charset
[j
] != 0)
570 tables
[t
].usecount
++;
574 for (t
= 0; t
< tableno
; t
++)
575 if (tables
[t
].usecount
> 1) {
576 p
= tables
[t
].minline
>> 5;
577 printf("static const unsigned short %s_page%02x[%d] = {\n", name
, p
, 8*(tables
[t
].maxline
-tables
[t
].minline
+1));
578 for (j1
= tables
[t
].minline
; j1
<= tables
[t
].maxline
; j1
++) {
579 if ((j1
% 0x20) == 0 && j1
> tables
[t
].minline
)
580 printf(" /* 0x%04x */\n", 8*j1
);
582 for (j2
= 0; j2
< 8; j2
++) {
584 printf(" 0x%04x,", enc
->uni2charset
[j
]);
586 printf(" /*0x%02x-0x%02x*/\n", 8*(j1
% 0x20), 8*(j1
% 0x20)+7);
593 printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name
);
595 printf(" if (n >= 2) {\n");
596 printf(" unsigned short c = 0;\n");
598 for (j1
= 0; j1
< 0x6000;) {
600 for (j2
= j1
; j2
< 0x6000 && line
[j2
] == t
; j2
++);
602 if (j1
!= tables
[t
].minline
) abort();
603 if (j2
> tables
[t
].maxline
+1) abort();
604 j2
= tables
[t
].maxline
+1;
610 if (tables
[t
].usecount
== 0) abort();
611 if (tables
[t
].usecount
== 1) {
612 if (j2
!= j1
+1) abort();
613 for (j
= 8*j1
; j
< 8*j2
; j
++)
614 if (enc
->uni2charset
[j
] != 0) {
615 printf("if (wc == 0x%04x)\n c = 0x%02x;\n", j
, enc
->uni2charset
[j
]);
620 printf("if (wc < 0x%04x)", 8*j2
);
622 printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1
, 8*j2
);
624 printf("\n c = %s_page%02x[wc", name
, j1
>> 5);
625 if (tables
[t
].minline
> 0)
626 printf("-0x%04x", 8*j1
);
632 printf(" if (c != 0) {\n");
633 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
634 printf(" return 2;\n");
636 printf(" return RET_ILUNI;\n");
638 printf(" return RET_TOOSMALL;\n");
643 * Outputs the unicode to charset table and function, using a packed array.
644 * (Suitable if the table is sparse.)
645 * The argument 'monotonic' may be set to true if the mapping is monotonically
646 * increasing with small gaps only.
648 static void output_uni2charset_sparse (const char* name
, Encoding
* enc
, bool monotonic
)
651 Block pageblocks
[0x300]; int npageblocks
;
652 int indx2charset
[0x30000];
653 int summary_indx
[0x3000];
654 int summary_used
[0x3000];
655 int i
, row
, col
, j
, p
, j1
, j2
, indx
;
658 int log2_stepsize
= (!strcmp(name
,"uhc_2") ? 6 : 7);
659 int stepsize
= 1 << log2_stepsize
;
662 /* Fill pages[0x300]. */
663 for (p
= 0; p
< 0x300; p
++)
665 for (row
= 0; row
< enc
->rows
; row
++)
666 for (col
= 0; col
< enc
->cols
; col
++) {
667 j
= enc
->charset2uni
[row
][col
];
672 /* Determine whether two or three bytes are needed for each character. */
674 for (j
= 0; j
< 0x30000; j
++)
675 if (enc
->uni2charset
[j
] >= 0x10000)
679 for (p
= 0; p
< 0x300; p
++)
681 printf("static const unsigned short %s_page%02x[256] = {\n", name
, p
);
682 for (j1
= 0; j1
< 32; j1
++) {
684 for (j2
= 0; j2
< 8; j2
++)
685 printf("0x%04x, ", enc
->uni2charset
[256*p
+8*j1
+j2
]);
686 printf("/""*0x%02x-0x%02x*""/\n", 8*j1
, 8*j1
+7);
693 /* Fill summary_indx[] and summary_used[]. */
695 for (j1
= 0; j1
< 0x3000; j1
++) {
696 summary_indx
[j1
] = indx
;
697 summary_used
[j1
] = 0;
698 for (j2
= 0; j2
< 16; j2
++) {
700 if (enc
->uni2charset
[j
] != 0) {
701 indx2charset
[indx
++] = enc
->uni2charset
[j
];
702 summary_used
[j1
] |= (1 << j2
);
707 /* Fill npageblocks and pageblocks[]. */
709 for (p
= 0; p
< 0x300; ) {
710 if (pages
[p
] && (p
== 0 || !pages
[p
-1])) {
711 pageblocks
[npageblocks
].start
= 16*p
;
712 do p
++; while (p
< 0x300 && pages
[p
]);
714 while (summary_used
[j1
-1] == 0) j1
--;
715 pageblocks
[npageblocks
].end
= j1
;
722 indxsteps
= (indx
+ stepsize
-1) / stepsize
;
723 printf("static const unsigned short %s_2charset_main[%d] = {\n", name
, indxsteps
);
724 for (i
= 0; i
< indxsteps
; ) {
725 if ((i
% 8) == 0) printf(" ");
726 printf(" 0x%04x,", indx2charset
[i
*stepsize
]);
728 if ((i
% 8) == 0 || i
== indxsteps
) printf("\n");
731 printf("static const unsigned char %s_2charset[%d] = {\n", name
, indx
);
732 for (i
= 0; i
< indx
; ) {
733 if ((i
% 8) == 0) printf(" ");
734 printf(" 0x%02x,", indx2charset
[i
] - indx2charset
[i
/stepsize
*stepsize
]);
736 if ((i
% 8) == 0 || i
== indx
) printf("\n");
741 printf("static const unsigned char %s_2charset[3*%d] = {\n", name
, indx
);
742 for (i
= 0; i
< indx
; ) {
743 if ((i
% 4) == 0) printf(" ");
744 printf(" 0x%1x,0x%02x,0x%02x,", indx2charset
[i
] >> 16,
745 (indx2charset
[i
] >> 8) & 0xff, indx2charset
[i
] & 0xff);
747 if ((i
% 4) == 0 || i
== indx
) printf("\n");
751 printf("static const unsigned short %s_2charset[%d] = {\n", name
, indx
);
752 for (i
= 0; i
< indx
; ) {
753 if ((i
% 8) == 0) printf(" ");
754 printf(" 0x%04x,", indx2charset
[i
]);
756 if ((i
% 8) == 0 || i
== indx
) printf("\n");
762 for (i
= 0; i
< npageblocks
; i
++) {
763 printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name
,
764 pageblocks
[i
].start
/16, pageblocks
[i
].end
-pageblocks
[i
].start
);
765 for (j1
= pageblocks
[i
].start
; j1
< pageblocks
[i
].end
; ) {
766 if (((16*j1
) % 0x100) == 0) printf(" /""* 0x%04x *""/\n", 16*j1
);
767 if ((j1
% 4) == 0) printf(" ");
768 printf(" { %4d, 0x%04x },", summary_indx
[j1
], summary_used
[j1
]);
770 if ((j1
% 4) == 0 || j1
== pageblocks
[i
].end
) printf("\n");
776 printf("static int\n");
777 printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name
);
779 printf(" if (n >= 2) {\n");
780 printf(" const Summary16 *summary = NULL;\n");
781 for (i
= 0; i
< npageblocks
; i
++) {
785 printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
786 16*pageblocks
[i
].start
, 16*pageblocks
[i
].end
);
787 printf(" summary = &%s_uni2indx_page%02x[(wc>>4)", name
,
788 pageblocks
[i
].start
/16);
789 if (pageblocks
[i
].start
> 0)
790 printf("-0x%03x", pageblocks
[i
].start
);
793 printf(" if (summary) {\n");
794 printf(" unsigned short used = summary->used;\n");
795 printf(" unsigned int i = wc & 0x0f;\n");
796 printf(" if (used & ((unsigned short) 1 << i)) {\n");
797 if (monotonic
|| !is_large
)
798 printf(" unsigned short c;\n");
799 printf(" /* Keep in `used' only the bits 0..i-1. */\n");
800 printf(" used &= ((unsigned short) 1 << i) - 1;\n");
801 printf(" /* Add `summary->indx' and the number of bits set in `used'. */\n");
802 printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
803 printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
804 printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
805 printf(" used = (used & 0x00ff) + (used >> 8);\n");
807 printf(" used += summary->indx;\n");
808 printf(" c = %s_2charset_main[used>>%d] + %s_2charset[used];\n", name
, log2_stepsize
, name
);
809 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
810 printf(" return 2;\n");
813 printf(" used += summary->indx;\n");
814 printf(" r[0] = %s_2charset[3*used];\n", name
);
815 printf(" r[1] = %s_2charset[3*used+1];\n", name
);
816 printf(" r[2] = %s_2charset[3*used+2];\n", name
);
817 printf(" return 3;\n");
819 printf(" c = %s_2charset[summary->indx + used];\n", name
);
820 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
821 printf(" return 2;\n");
826 printf(" return RET_ILUNI;\n");
828 printf(" return RET_TOOSMALL;\n");
832 /* ISO-2022/EUC specifics */
/* Returns the 1st byte value for a zero-based row: row 0 is byte 0x21. */
static int row_byte_normal (int row)
{
  const int first_byte = 0x21;

  return first_byte + row;
}
/* Returns the 2nd byte value for a zero-based column: column 0 is byte
   0x21. */
static int col_byte_normal (int col)
{
  const int first_byte = 0x21;

  return first_byte + col;
}
/* Converts a 1st byte value to a zero-based row, else -1.
   Valid bytes are 0x21..0x7e, the range that check_row_expr
   ("%1$s >= 0x21 && %1$s < 0x7f") describes.  Bytes above that range used
   to yield an index past the last row; they are now rejected with -1 so
   that callers testing "row < 0" catch them before indexing charset2uni. */
static int byte_row_normal (int byte)
{
  if (byte >= 0x21 && byte < 0x7f)
    return byte - 0x21;
  else
    return -1;
}
/* Converts a 2nd byte value to a zero-based column, else -1.
   Valid bytes are 0x21..0x7e, the range that check_col_expr
   ("%1$s >= 0x21 && %1$s < 0x7f") describes.  Bytes above that range used
   to yield an index past the last column; they are now rejected with -1
   so that callers testing "col < 0" catch them before indexing
   charset2uni. */
static int byte_col_normal (int byte)
{
  if (byte >= 0x21 && byte < 0x7f)
    return byte - 0x21;
  else
    return -1;
}
839 static void do_normal (const char* name
)
845 enc
.row_byte
= row_byte_normal
;
846 enc
.col_byte
= col_byte_normal
;
847 enc
.byte_row
= byte_row_normal
;
848 enc
.byte_col
= byte_col_normal
;
849 enc
.check_row_expr
= "%1$s >= 0x21 && %1$s < 0x7f";
850 enc
.check_col_expr
= "%1$s >= 0x21 && %1$s < 0x7f";
851 enc
.byte_row_expr
= "%1$s - 0x21";
852 enc
.byte_col_expr
= "%1$s - 0x21";
855 output_charset2uni(name
,&enc
);
856 invert(&enc
); output_uni2charset_sparse(name
,&enc
,false);
859 /* Note: At first sight, the jisx0212_2charset[] table seems to be in order,
860 starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
861 order. There are 75 out-of-order values, scattered all throughout the table.
864 static void do_normal_only_charset2uni (const char* name
)
870 enc
.row_byte
= row_byte_normal
;
871 enc
.col_byte
= col_byte_normal
;
872 enc
.byte_row
= byte_row_normal
;
873 enc
.byte_col
= byte_col_normal
;
874 enc
.check_row_expr
= "%1$s >= 0x21 && %1$s < 0x7f";
875 enc
.check_col_expr
= "%1$s >= 0x21 && %1$s < 0x7f";
876 enc
.byte_row_expr
= "%1$s - 0x21";
877 enc
.byte_col_expr
= "%1$s - 0x21";
880 output_charset2uni(name
,&enc
);
883 /* CNS 11643 specifics - trick to put two tables into one */
/* Returns the combined "1st byte" value for a row of the merged table.
   Rows are grouped in runs of 94: the run number is placed above the low
   8 bits (times 0x100) and the position inside the run becomes a byte
   starting at 0x21. */
static int row_byte_cns11643 (int row)
{
  int group = row / 94;
  int offset = row % 94;

  return 0x100 * group + offset + 0x21;
}
/* Converts a combined "1st byte" value back to a row of the merged table:
   the part above the low 8 bits selects a run of 94 rows, the low 8 bits
   (minus 0x21) select the row inside that run.  No range check is made. */
static int byte_row_cns11643 (int byte)
{
  int group = byte >> 8;
  int low = byte & 0xff;

  return group * 94 + low - 0x21;
}
892 static void do_cns11643_only_uni2charset (const char* name
)
898 enc
.row_byte
= row_byte_cns11643
;
899 enc
.col_byte
= col_byte_normal
;
900 enc
.byte_row
= byte_row_cns11643
;
901 enc
.byte_col
= byte_col_normal
;
902 enc
.check_row_expr
= "%1$s >= 0x21 && %1$s < 0x7f";
903 enc
.check_col_expr
= "%1$s >= 0x21 && %1$s < 0x7f";
904 enc
.byte_row_expr
= "%1$s - 0x21";
905 enc
.byte_col_expr
= "%1$s - 0x21";
909 output_uni2charset_sparse(name
,&enc
,false);
914 static int row_byte_gbk1 (int row
) {
/* Returns the 2nd byte value for a zero-based column.
   Columns 0x00..0x3e map to bytes 0x40..0x7e; columns from 0x3f upward
   map to 0x80 and above, so byte 0x7f is never produced. */
static int col_byte_gbk1 (int col)
{
  int base = 0x40;

  if (col >= 0x3f)
    base = 0x41;
  return base + col;
}
920 static int byte_row_gbk1 (int byte
) {
921 if (byte
>= 0x81 && byte
< 0xff)
926 static int byte_col_gbk1 (int byte
) {
927 if (byte
>= 0x40 && byte
< 0x7f)
929 else if (byte
>= 0x80 && byte
< 0xff)
935 static void do_gbk1 (const char* name
)
941 enc
.row_byte
= row_byte_gbk1
;
942 enc
.col_byte
= col_byte_gbk1
;
943 enc
.byte_row
= byte_row_gbk1
;
944 enc
.byte_col
= byte_col_gbk1
;
945 enc
.check_row_expr
= "%1$s >= 0x81 && %1$s < 0xff";
946 enc
.check_col_expr
= "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
947 enc
.byte_row_expr
= "%1$s - 0x81";
948 enc
.byte_col_expr
= "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
951 output_charset2uni(name
,&enc
);
952 invert(&enc
); output_uni2charset_dense(name
,&enc
);
955 static void do_gbk1_only_charset2uni (const char* name
)
961 enc
.row_byte
= row_byte_gbk1
;
962 enc
.col_byte
= col_byte_gbk1
;
963 enc
.byte_row
= byte_row_gbk1
;
964 enc
.byte_col
= byte_col_gbk1
;
965 enc
.check_row_expr
= "%1$s >= 0x81 && %1$s < 0xff";
966 enc
.check_col_expr
= "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
967 enc
.byte_row_expr
= "%1$s - 0x81";
968 enc
.byte_col_expr
= "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
971 output_charset2uni(name
,&enc
);
974 static int row_byte_gbk2 (int row
) {
/* Returns the 2nd byte value for a zero-based column.
   Columns 0x00..0x3e map to bytes 0x40..0x7e; columns from 0x3f upward
   map to 0x80 and above, so byte 0x7f is never produced. */
static int col_byte_gbk2 (int col)
{
  int base = 0x40;

  if (col >= 0x3f)
    base = 0x41;
  return base + col;
}
980 static int byte_row_gbk2 (int byte
) {
981 if (byte
>= 0x81 && byte
< 0xff)
986 static int byte_col_gbk2 (int byte
) {
987 if (byte
>= 0x40 && byte
< 0x7f)
989 else if (byte
>= 0x80 && byte
< 0xa1)
995 static void do_gbk2_only_charset2uni (const char* name
)
1001 enc
.row_byte
= row_byte_gbk2
;
1002 enc
.col_byte
= col_byte_gbk2
;
1003 enc
.byte_row
= byte_row_gbk2
;
1004 enc
.byte_col
= byte_col_gbk2
;
1005 enc
.check_row_expr
= "%1$s >= 0x81 && %1$s < 0xff";
1006 enc
.check_col_expr
= "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
1007 enc
.byte_row_expr
= "%1$s - 0x81";
1008 enc
.byte_col_expr
= "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1011 output_charset2uni(name
,&enc
);
1014 static void do_gbk1_only_uni2charset (const char* name
)
1020 enc
.row_byte
= row_byte_gbk1
;
1021 enc
.col_byte
= col_byte_gbk1
;
1022 enc
.byte_row
= byte_row_gbk1
;
1023 enc
.byte_col
= byte_col_gbk1
;
1024 enc
.check_row_expr
= "%1$s >= 0x81 && %1$s < 0xff";
1025 enc
.check_col_expr
= "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
1026 enc
.byte_row_expr
= "%1$s - 0x81";
1027 enc
.byte_col_expr
= "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1030 invert(&enc
); output_uni2charset_sparse(name
,&enc
,false);
1033 /* KSC 5601 specifics */
1036 * Reads the charset2uni table from standard input.
1038 static void read_table_ksc5601 (Encoding
* enc
)
1040 int row
, col
, i
, i1
, i2
, c
, j
;
1042 enc
->charset2uni
= (int**) malloc(enc
->rows
*sizeof(int*));
1043 for (row
= 0; row
< enc
->rows
; row
++)
1044 enc
->charset2uni
[row
] = (int*) malloc(enc
->cols
*sizeof(int));
1046 for (row
= 0; row
< enc
->rows
; row
++)
1047 for (col
= 0; col
< enc
->cols
; col
++)
1048 enc
->charset2uni
[row
][col
] = 0xfffd;
1053 /* Read a unicode.org style .TXT file. */
1058 if (c
== '\n' || c
== ' ' || c
== '\t')
1061 do { c
= getc(stdin
); } while (!(c
== EOF
|| c
== '\n'));
1065 if (scanf("0x%x", &j
) != 1)
1069 if (scanf(" 0x%x", &j
) != 1)
1071 /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
1072 = KS X 1001.1992, ignore the rest. */
1073 if (!(i1
>= 128+33 && i1
< 128+127 && i2
>= 128+33 && i2
< 128+127))
1074 continue; /* KSC5601 specific */
1075 i1
&= 0x7f; /* KSC5601 specific */
1076 i2
&= 0x7f; /* KSC5601 specific */
1077 row
= enc
->byte_row(i1
);
1078 col
= enc
->byte_col(i2
);
1079 if (row
< 0 || col
< 0) {
1080 fprintf(stderr
, "lost entry for %02x %02x\n", i1
, i2
);
1083 enc
->charset2uni
[row
][col
] = j
;
1086 /* Read a table of hexadecimal Unicode values. */
1087 for (i1
= 33; i1
< 127; i1
++)
1088 for (i2
= 33; i2
< 127; i2
++) {
1089 i
= scanf("%x", &j
);
1094 if (j
< 0 || j
== 0xffff)
1097 if (enc
->byte_row(i1
) < 0 || enc
->byte_col(i2
) < 0) {
1098 fprintf(stderr
, "lost entry at %02x %02x\n", i1
, i2
);
1101 enc
->charset2uni
[enc
->byte_row(i1
)][enc
->byte_col(i2
)] = j
;
1108 static void do_ksc5601 (const char* name
)
1114 enc
.row_byte
= row_byte_normal
;
1115 enc
.col_byte
= col_byte_normal
;
1116 enc
.byte_row
= byte_row_normal
;
1117 enc
.byte_col
= byte_col_normal
;
1118 enc
.check_row_expr
= "%1$s >= 0x21 && %1$s < 0x7f";
1119 enc
.check_col_expr
= "%1$s >= 0x21 && %1$s < 0x7f";
1120 enc
.byte_row_expr
= "%1$s - 0x21";
1121 enc
.byte_col_expr
= "%1$s - 0x21";
1123 read_table_ksc5601(&enc
);
1124 output_charset2uni(name
,&enc
);
1125 invert(&enc
); output_uni2charset_sparse(name
,&enc
,false);
1130 /* UHC part 1: 0x{81..A0}{41..5A,61..7A,81..FE} */
1132 static int row_byte_uhc_1 (int row
) {
/* Returns the 2nd byte value for a zero-based column.
   Columns 0x00..0x19 map to bytes 0x41..0x5a, columns 0x1a..0x33 to
   0x61..0x7a, and columns from 0x34 upward to 0x81 and above. */
static int col_byte_uhc_1 (int col)
{
  int base;

  if (col >= 0x34)
    base = 0x4d;
  else if (col >= 0x1a)
    base = 0x47;
  else
    base = 0x41;
  return base + col;
}
1138 static int byte_row_uhc_1 (int byte
) {
1139 if (byte
>= 0x81 && byte
< 0xa1)
1144 static int byte_col_uhc_1 (int byte
) {
1145 if (byte
>= 0x41 && byte
< 0x5b)
1147 else if (byte
>= 0x61 && byte
< 0x7b)
1149 else if (byte
>= 0x81 && byte
< 0xff)
1155 static void do_uhc_1 (const char* name
)
1161 enc
.row_byte
= row_byte_uhc_1
;
1162 enc
.col_byte
= col_byte_uhc_1
;
1163 enc
.byte_row
= byte_row_uhc_1
;
1164 enc
.byte_col
= byte_col_uhc_1
;
1165 enc
.check_row_expr
= "(%1$s >= 0x81 && %1$s < 0xa1)";
1166 enc
.check_col_expr
= "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xff)";
1167 enc
.byte_row_expr
= "%1$s - 0x81";
1168 enc
.byte_col_expr
= "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1171 output_charset2uni_noholes_monotonic(name
,&enc
);
1172 invert(&enc
); output_uni2charset_sparse(name
,&enc
,true);
1175 /* UHC part 2: 0x{A1..C6}{41..5A,61..7A,81..A0} */
1177 static int row_byte_uhc_2 (int row
) {
/* Returns the 2nd byte value for a zero-based column.
   Columns 0x00..0x19 map to bytes 0x41..0x5a, columns 0x1a..0x33 to
   0x61..0x7a, and columns from 0x34 upward to 0x81 and above. */
static int col_byte_uhc_2 (int col)
{
  int base;

  if (col >= 0x34)
    base = 0x4d;
  else if (col >= 0x1a)
    base = 0x47;
  else
    base = 0x41;
  return base + col;
}
1183 static int byte_row_uhc_2 (int byte
) {
1184 if (byte
>= 0xa1 && byte
< 0xff)
1189 static int byte_col_uhc_2 (int byte
) {
1190 if (byte
>= 0x41 && byte
< 0x5b)
1192 else if (byte
>= 0x61 && byte
< 0x7b)
1194 else if (byte
>= 0x81 && byte
< 0xa1)
/* Generates the tables for UHC part 2 under the given table name.
   Same sequence as for UHC part 1: install the part 2 callbacks and
   expression templates in `enc`, emit the charset->Unicode table, invert,
   and emit the Unicode->charset table.
   NOTE(review): the declaration of `enc`, the rows/cols setup and the
   input-reading call are not visible here. */
1200 static void do_uhc_2 (const char* name
)
/* Byte <-> index conversion callbacks. */
1206 enc
.row_byte
= row_byte_uhc_2
;
1207 enc
.col_byte
= col_byte_uhc_2
;
1208 enc
.byte_row
= byte_row_uhc_2
;
1209 enc
.byte_col
= byte_col_uhc_2
;
/* Expression templates mirroring the callbacks; "%1$s" is a positional
   conversion so one argument can appear several times (presumably the
   byte variable name in the generated code -- confirm). */
1210 enc
.check_row_expr
= "(%1$s >= 0xa1 && %1$s < 0xff)";
1211 enc
.check_col_expr
= "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xa1)";
1212 enc
.byte_row_expr
= "%1$s - 0xa1";
1213 enc
.byte_col_expr
= "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
/* Emit the forward table, then invert and emit the reverse table. */
1216 output_charset2uni_noholes_monotonic(name
,&enc
);
1217 invert(&enc
); output_uni2charset_sparse(name
,&enc
,true);
1220 /* Big5 specifics */
/* Row-to-byte callback for Big5; do_big5 installs it as enc.row_byte,
   i.e. it yields the 1st byte value for a given row index.
   NOTE(review): the function body is not visible here.  The matching
   byte_row_expr "%1$s - 0xa1" suggests the result is 0xa1 + row -- confirm. */
1222 static int row_byte_big5 (int row
) {
/* Returns the 2nd byte value for column index col (Big5; also used for
   HKSCS by do_hkscs).
     col <  0x3f  -> 0x40 + col  (bytes 0x40..0x7e)
     col >= 0x3f  -> 0x62 + col  (bytes 0xa1 and up)  */
1225 static int col_byte_big5 (int col
) {
1226 return (col
>= 0x3f ? 0x62 : 0x40) + col
;
/* Converts a 1st byte value to a row index for Big5.
   The visible test accepts bytes 0xa1..0xfe.
   NOTE(review): the return statements are not visible here; per the
   byte_row field description the rejected case yields -1, and
   byte_row_expr "%1$s - 0xa1" suggests the accepted case returns
   byte - 0xa1 -- confirm. */
1228 static int byte_row_big5 (int byte
) {
1229 if (byte
>= 0xa1 && byte
< 0xff)
/* Converts a 2nd byte value to a column index for Big5 (also used for
   HKSCS by do_hkscs).  Two byte ranges are recognised: 0x40..0x7e and
   0xa1..0xfe.
   NOTE(review): the per-branch return statements are not visible here;
   presumably they subtract the offsets used by col_byte_big5
   (0x40, 0x62) and fall back to -1 -- confirm. */
1234 static int byte_col_big5 (int byte
) {
/* Low run of trail bytes. */
1235 if (byte
>= 0x40 && byte
< 0x7f)
/* High run of trail bytes. */
1237 else if (byte
>= 0xa1 && byte
< 0xff)
/* Generates the tables for Big5-style two-byte charsets under the given
   table name.  Installs the Big5 callbacks and expression templates in
   `enc`, emits the charset->Unicode table with output_charset2uni, then
   inverts the mapping and emits a sparse Unicode->charset table.
   NOTE(review): the declaration of `enc`, the rows/cols setup and the
   input-reading call are not visible here. */
1243 static void do_big5 (const char* name
)
/* Byte <-> index conversion callbacks. */
1249 enc
.row_byte
= row_byte_big5
;
1250 enc
.col_byte
= col_byte_big5
;
1251 enc
.byte_row
= byte_row_big5
;
1252 enc
.byte_col
= byte_col_big5
;
/* Expression templates mirroring the callbacks; "%1$s" is a positional
   conversion so one argument can appear several times (presumably the
   byte variable name in the generated code -- confirm). */
1253 enc
.check_row_expr
= "%1$s >= 0xa1 && %1$s < 0xff";
1254 enc
.check_col_expr
= "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1255 enc
.byte_row_expr
= "%1$s - 0xa1";
1256 enc
.byte_col_expr
= "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
/* Emit the forward table, then invert and emit the reverse table.
   Note the last argument is false here, whereas the UHC generators
   pass true. */
1259 output_charset2uni(name
,&enc
);
1260 invert(&enc
); output_uni2charset_sparse(name
,&enc
,false);
1263 /* HKSCS specifics */
/* Row-to-byte callback for HKSCS; do_hkscs installs it as enc.row_byte,
   i.e. it yields the 1st byte value for a given row index.
   NOTE(review): the function body is not visible here.  The matching
   byte_row_expr "%1$s - 0x80" suggests the result is 0x80 + row -- confirm. */
1265 static int row_byte_hkscs (int row
) {
/* Converts a 1st byte value to a row index for HKSCS.
   The visible test accepts bytes 0x80..0xfe.
   NOTE(review): the return statements are not visible here; per the
   byte_row field description the rejected case yields -1, and
   byte_row_expr "%1$s - 0x80" suggests the accepted case returns
   byte - 0x80 -- confirm. */
1268 static int byte_row_hkscs (int byte
) {
1269 if (byte
>= 0x80 && byte
< 0xff)
/* Generates the tables for HKSCS under the given table name.
   Rows use the HKSCS-specific callbacks (1st byte starting at 0x80),
   while columns reuse the Big5 callbacks and column expressions.
   Emits the charset->Unicode table, inverts the mapping and emits a
   sparse Unicode->charset table.
   NOTE(review): the declaration of `enc`, the rows/cols setup and the
   input-reading call are not visible here. */
1275 static void do_hkscs (const char* name
)
/* HKSCS rows, Big5 columns. */
1281 enc
.row_byte
= row_byte_hkscs
;
1282 enc
.col_byte
= col_byte_big5
;
1283 enc
.byte_row
= byte_row_hkscs
;
1284 enc
.byte_col
= byte_col_big5
;
/* Expression templates mirroring the callbacks; "%1$s" is a positional
   conversion so one argument can appear several times (presumably the
   byte variable name in the generated code -- confirm). */
1285 enc
.check_row_expr
= "%1$s >= 0x80 && %1$s < 0xff";
1286 enc
.check_col_expr
= "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1287 enc
.byte_row_expr
= "%1$s - 0x80";
1288 enc
.byte_col_expr
= "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
/* Emit the forward table, then invert and emit the reverse table. */
1291 output_charset2uni(name
,&enc
);
1292 invert(&enc
); output_uni2charset_sparse(name
,&enc
,false);
1295 /* Johab Hangul specifics */
/* Row-to-byte callback for Johab Hangul; do_johab_hangul installs it as
   enc.row_byte, i.e. it yields the 1st byte value for a given row index.
   NOTE(review): the function body is not visible here.  The matching
   byte_row_expr "%1$s - 0x84" suggests the result is 0x84 + row -- confirm. */
1297 static int row_byte_johab_hangul (int row
) {
/* Returns the 2nd byte value for column index col (Johab Hangul).
     col <  0x3e  -> 0x41 + col  (bytes 0x41..0x7e)
     col >= 0x3e  -> 0x43 + col  (bytes 0x81 and up)  */
1300 static int col_byte_johab_hangul (int col
) {
1301 return (col
>= 0x3e ? 0x43 : 0x41) + col
;
/* Converts a 1st byte value to a row index for Johab Hangul.
   The visible test accepts bytes 0x84..0xd3.
   NOTE(review): the return statements are not visible here; per the
   byte_row field description the rejected case yields -1, and
   byte_row_expr "%1$s - 0x84" suggests the accepted case returns
   byte - 0x84 -- confirm. */
1303 static int byte_row_johab_hangul (int byte
) {
1304 if (byte
>= 0x84 && byte
< 0xd4)
/* Converts a 2nd byte value to a column index for Johab Hangul.
   Two byte ranges are recognised: 0x41..0x7e and 0x81..0xfe.
   NOTE(review): the per-branch return statements are not visible here;
   presumably they subtract the offsets used by col_byte_johab_hangul
   (0x41, 0x43) and fall back to -1 -- confirm. */
1309 static int byte_col_johab_hangul (int byte
) {
/* Low run of trail bytes. */
1310 if (byte
>= 0x41 && byte
< 0x7f)
/* High run of trail bytes. */
1312 else if (byte
>= 0x81 && byte
< 0xff)
/* Generates the tables for Johab Hangul under the given table name.
   Installs the Johab Hangul callbacks and expression templates in `enc`,
   emits the charset->Unicode table, inverts the mapping and emits the
   Unicode->charset table.  Unlike the other generators in this part of
   the file, the reverse table is written with output_uni2charset_dense.
   NOTE(review): the declaration of `enc`, the rows/cols setup and the
   input-reading call are not visible here. */
1318 static void do_johab_hangul (const char* name
)
/* Byte <-> index conversion callbacks. */
1324 enc
.row_byte
= row_byte_johab_hangul
;
1325 enc
.col_byte
= col_byte_johab_hangul
;
1326 enc
.byte_row
= byte_row_johab_hangul
;
1327 enc
.byte_col
= byte_col_johab_hangul
;
/* Expression templates mirroring the callbacks; "%1$s" is a positional
   conversion so one argument can appear several times (presumably the
   byte variable name in the generated code -- confirm). */
1328 enc
.check_row_expr
= "%1$s >= 0x84 && %1$s < 0xd4";
1329 enc
.check_col_expr
= "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
1330 enc
.byte_row_expr
= "%1$s - 0x84";
1331 enc
.byte_col_expr
= "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
/* Emit the forward table, then invert and emit the dense reverse table. */
1334 output_charset2uni(name
,&enc
);
1335 invert(&enc
); output_uni2charset_dense(name
,&enc
);
1338 /* SJIS specifics */
/* Returns the 1st byte value for row index row (Shift_JIS layout).
     row <  0x1f  -> 0x81 + row  (bytes 0x81..0x9f)
     row >= 0x1f  -> 0xc1 + row  (bytes 0xe0 and up)  */
1340 static int row_byte_sjis (int row
) {
1341 return (row
>= 0x1f ? 0xc1 : 0x81) + row
;
/* Returns the 2nd byte value for column index col (Shift_JIS layout).
     col <  0x3f  -> 0x40 + col  (bytes 0x40..0x7e)
     col >= 0x3f  -> 0x41 + col  (bytes 0x80 and up; 0x7f is skipped)  */
1343 static int col_byte_sjis (int col
) {
1344 return (col
>= 0x3f ? 0x41 : 0x40) + col
;
/* Converts a 1st byte value to a row index (Shift_JIS layout).
   Two lead-byte ranges are recognised: 0x81..0x9f, and 0xe0 and above
   (the second test has no upper bound).
   NOTE(review): the per-branch return statements are not visible here;
   presumably they subtract the offsets used by row_byte_sjis
   (0x81, 0xc1) and fall back to -1 -- confirm. */
1346 static int byte_row_sjis (int byte
) {
/* Low run of lead bytes. */
1347 if (byte
>= 0x81 && byte
< 0xa0)
/* High run of lead bytes. */
1349 else if (byte
>= 0xe0)
/* Converts a 2nd byte value to a column index (Shift_JIS layout).
   Two trail-byte ranges are recognised: 0x40..0x7e and 0x80..0xfc.
   NOTE(review): the per-branch return statements are not visible here;
   presumably they subtract the offsets used by col_byte_sjis
   (0x40, 0x41) and fall back to -1 -- confirm. */
1354 static int byte_col_sjis (int byte
) {
/* Low run of trail bytes. */
1355 if (byte
>= 0x40 && byte
< 0x7f)
/* High run of trail bytes. */
1357 else if (byte
>= 0x80 && byte
< 0xfd)
/* Generates the tables for a Shift_JIS-layout charset under the given
   table name.  Installs the SJIS callbacks and expression templates in
   `enc`, emits the charset->Unicode table, inverts the mapping and emits
   a sparse Unicode->charset table.
   NOTE(review): the declaration of `enc`, the rows/cols setup and the
   input-reading call are not visible here. */
1363 static void do_sjis (const char* name
)
/* Byte <-> index conversion callbacks. */
1369 enc
.row_byte
= row_byte_sjis
;
1370 enc
.col_byte
= col_byte_sjis
;
1371 enc
.byte_row
= byte_row_sjis
;
1372 enc
.byte_col
= byte_col_sjis
;
/* Expression templates mirroring the callbacks; "%1$s" is a positional
   conversion so one argument can appear several times (presumably the
   byte variable name in the generated code -- confirm). */
1373 enc
.check_row_expr
= "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
1374 enc
.check_col_expr
= "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
1375 enc
.byte_row_expr
= "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
1376 enc
.byte_col_expr
= "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
/* Emit the forward table, then invert and emit the reverse table. */
1379 output_charset2uni(name
,&enc
);
1380 invert(&enc
); output_uni2charset_sparse(name
,&enc
,false);
1383 /* GB18030 Unicode specifics */
/* Generates the GB18030 four-byte <-> Unicode (16-bit) conversion code
   under the given table name.

   Reads a unicode.org style .TXT table from stdin, where each entry pairs
   a four-byte code (1st byte 0x81..0x84, 2nd 0x30..0x39, 3rd 0x81..0xfe,
   4th 0x30..0x39) with a Unicode value below 0x10000.  The four bytes are
   linearised into one index i, and the mapping i -> j is compressed into a
   list of ranges with a constant difference j - i per range, plus a bitmap
   recording which positions inside the ranges are actually mapped.

   Output, written to stdout with printf:
     - <name>_charset2uni_ranges[] : low/high pairs in charset index space
     - <name>_uni2charset_ranges[] : the same pairs shifted into Unicode space
     - <name>_ranges[]             : per-range diff and bitmap offset
     - <name>_bitmap[]             : one bit per position, LSB first
     - <name>_mbtowc() and <name>_wctomb() doing a binary search over the
       range tables.

   NOTE(review): a number of statements of this function (loop bodies,
   error exits, some declarations) are not visible here; comments below
   only describe what the visible lines establish. */
1385 static void do_gb18030uni (const char* name
)
/* i1..i4 are the four bytes of a code; i is the linearised charset index,
   j a Unicode value, k a range index. */
1389 int i1
, i2
, i3
, i4
, i
, j
, k
;
/* 4*10*126*10 is the number of four-byte codes in the accepted byte
   ranges (4 x 10 x 126 x 10 values). */
1390 int charset2uni
[4*10*126*10];
1391 int uni2charset
[0x10000];
/* At most 256 ranges; `total` is the bitmap offset of a range's first bit. */
1392 struct { int low
; int high
; int diff
; int total
; } ranges
[256];
1393 int ranges_count
, ranges_total
;
/* Walk both tables once before reading input (the loop bodies are not
   visible here; presumably they clear the entries -- confirm). */
1395 for (i
= 0; i
< 4*10*126*10; i
++)
1397 for (j
= 0; j
< 0x10000; j
++)
1400 /* Read a unicode.org style .TXT file. */
/* Whitespace characters are tested for separately from data. */
1405 if (c
== '\n' || c
== ' ' || c
== '\t')
/* Consume the rest of the current line, up to newline or EOF. */
1408 do { c
= getc(stdin
); } while (!(c
== EOF
|| c
== '\n'));
/* Parse the four-byte code as one hexadecimal number. */
1412 if (scanf("0x%x", &bytes
) != 1)
/* Split the number into its individual bytes, most significant first. */
1414 i1
= (bytes
>> 24) & 0xff;
1415 i2
= (bytes
>> 16) & 0xff;
1416 i3
= (bytes
>> 8) & 0xff;
/* Entries outside the accepted byte ranges are reported on stderr. */
1418 if (!(i1
>= 0x81 && i1
<= 0x84
1419 && i2
>= 0x30 && i2
<= 0x39
1420 && i3
>= 0x81 && i3
<= 0xfe
1421 && i4
>= 0x30 && i4
<= 0x39)) {
1422 fprintf(stderr
, "lost entry for %02x %02x %02x %02x\n", i1
, i2
, i3
, i4
);
/* Linearise the four bytes: mixed radix 10 / 126 / 10 below the 1st byte. */
1425 i
= (((i1
-0x81) * 10 + (i2
-0x30)) * 126 + (i3
-0x81)) * 10 + (i4
-0x30);
/* Parse the Unicode value and require it to fit in 16 bits. */
1426 if (scanf(" 0x%x", &j
) != 1)
1428 if (!(j
>= 0 && j
< 0x10000))
1434 /* Verify that the mapping i -> j is monotonically increasing and
1436 low[k] <= i <= high[k] => j = diff[k] + i
1437 with a set of disjoint intervals (low[k], high[k]). */
1439 for (i
= 0; i
< 4*10*126*10; i
++)
/* A zero entry is treated as "unmapped" and skipped. */
1440 if (charset2uni
[i
] != 0) {
1444 if (ranges_count
> 0) {
/* The new index must lie beyond the previous range ... */
1445 if (!(i
> ranges
[ranges_count
-1].high
))
/* ... and so must the new Unicode value. */
1447 if (!(j
> ranges
[ranges_count
-1].high
+ ranges
[ranges_count
-1].diff
))
1449 /* Additional property: The diffs are also increasing. */
1450 if (!(diff
>= ranges
[ranges_count
-1].diff
))
/* Same diff as the previous range: extend that range up to i. */
1453 if (ranges_count
> 0 && diff
== ranges
[ranges_count
-1].diff
)
1454 ranges
[ranges_count
-1].high
= i
;
/* Otherwise a new range is started; 256 is the capacity of ranges[]. */
1456 if (ranges_count
== 256)
1458 ranges
[ranges_count
].low
= i
;
1459 ranges
[ranges_count
].high
= i
;
1460 ranges
[ranges_count
].diff
= diff
;
1465 /* Determine size of bitmap. */
/* Each range gets the running bit count as its bitmap offset. */
1467 for (k
= 0; k
< ranges_count
; k
++) {
1468 ranges
[k
].total
= ranges_total
;
1469 ranges_total
+= ranges
[k
].high
- ranges
[k
].low
+ 1;
/* Emit the low/high pairs in charset index space, four pairs per line. */
1472 printf("static const unsigned short %s_charset2uni_ranges[%d] = {\n", name
, 2*ranges_count
);
1473 for (k
= 0; k
< ranges_count
; k
++) {
1474 printf(" 0x%04x, 0x%04x", ranges
[k
].low
, ranges
[k
].high
);
1475 if (k
+1 < ranges_count
) printf(",");
1476 if ((k
% 4) == 3 && k
+1 < ranges_count
) printf("\n");
/* Emit the same pairs shifted by diff, i.e. in Unicode space. */
1483 printf("static const unsigned short %s_uni2charset_ranges[%d] = {\n", name
, 2*ranges_count
);
1484 for (k
= 0; k
< ranges_count
; k
++) {
1485 printf(" 0x%04x, 0x%04x", ranges
[k
].low
+ ranges
[k
].diff
, ranges
[k
].high
+ ranges
[k
].diff
);
1486 if (k
+1 < ranges_count
) printf(",");
1487 if ((k
% 4) == 3 && k
+1 < ranges_count
) printf("\n");
/* Emit the per-range diff and bitmap offset. */
1494 printf("static const struct { unsigned short diff; unsigned short bitmap_offset; } %s_ranges[%d] = {\n ", name
, ranges_count
);
1495 for (k
= 0; k
< ranges_count
; k
++) {
1496 printf(" { %5d, 0x%04x }", ranges
[k
].diff
, ranges
[k
].total
);
1497 if (k
+1 < ranges_count
) printf(",");
1498 if ((k
% 4) == 3 && k
+1 < ranges_count
) printf("\n ");
/* Emit the bitmap, rounded up to whole bytes. */
1505 printf("static const unsigned char %s_bitmap[%d] = {\n ", name
, (ranges_total
+ 7) / 8);
1508 for (k
= 0; k
< ranges_count
; k
++) {
/* i is the bit index in the concatenated bitmap.  The loop header has no
   increment expression, so i is presumably advanced in the body (not
   visible here) -- confirm. */
1509 for (i
= ranges
[k
].total
; i
<= ranges
[k
].total
+ (ranges
[k
].high
- ranges
[k
].low
);) {
/* Translate the bit index back to a charset index and test the mapping. */
1510 if (charset2uni
[i
- ranges
[k
].total
+ ranges
[k
].low
] != 0)
/* Bits are packed least significant bit first within each byte. */
1511 accu
|= (1 << (i
% 8));
1514 printf(" 0x%02x", accu
);
1515 if ((i
/ 8) < (ranges_total
+ 7) / 8) printf(",");
1516 if (((i
/ 8) % 12) == 0)
/* Consistency check: after a range, i must equal the next range's offset
   (or the overall bit count after the last range). */
1521 if (i
!= (k
+1 < ranges_count
? ranges
[k
+1].total
: ranges_total
)) abort();
/* Output the accumulator once more when the bit count is not a multiple
   of 8. */
1523 if ((ranges_total
% 8) != 0)
1524 printf(" 0x%02x", accu
);
/* Emit <name>_mbtowc: validate the four bytes, linearise them, binary
   search the charset ranges, consult the bitmap, then add the diff. */
1531 printf("static int\n");
1532 printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name
);
1534 printf(" unsigned char c1 = s[0];\n");
1535 printf(" if (c1 >= 0x81 && c1 <= 0x84) {\n");
1536 printf(" if (n >= 2) {\n");
1537 printf(" unsigned char c2 = s[1];\n");
1538 printf(" if (c2 >= 0x30 && c2 <= 0x39) {\n");
1539 printf(" if (n >= 3) {\n");
1540 printf(" unsigned char c3 = s[2];\n");
1541 printf(" if (c3 >= 0x81 && c3 <= 0xfe) {\n");
1542 printf(" if (n >= 4) {\n");
1543 printf(" unsigned char c4 = s[3];\n");
1544 printf(" if (c4 >= 0x30 && c4 <= 0x39) {\n");
1545 printf(" unsigned int i = (((c1 - 0x81) * 10 + (c2 - 0x30)) * 126 + (c3 - 0x81)) * 10 + (c4 - 0x30);\n");
/* Quick reject outside the span from the first range's low to the last
   range's high. */
1546 printf(" if (i >= %d && i <= %d) {\n", ranges
[0].low
, ranges
[ranges_count
-1].high
);
1547 printf(" unsigned int k1 = 0;\n");
1548 printf(" unsigned int k2 = %d;\n", ranges_count
-1);
1549 printf(" while (k1 < k2) {\n");
1550 printf(" unsigned int k = (k1 + k2) / 2;\n");
/* Element 2*k+1 is the high bound of range k, 2*k+2 the low bound of
   range k+1. */
1551 printf(" if (i <= %s_charset2uni_ranges[2*k+1])\n", name
);
1552 printf(" k2 = k;\n");
1553 printf(" else if (i >= %s_charset2uni_ranges[2*k+2])\n", name
);
1554 printf(" k1 = k + 1;\n");
1556 printf(" return RET_ILSEQ;\n");
1559 printf(" unsigned int bitmap_index = i - %s_charset2uni_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name
, name
);
1560 printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name
);
1561 printf(" unsigned int diff = %s_ranges[k1].diff;\n", name
);
1562 printf(" *pwc = (ucs4_t) (i + diff);\n");
1563 printf(" return 4;\n");
1568 printf(" return RET_ILSEQ;\n");
1570 printf(" return RET_TOOFEW(0);\n");
1572 printf(" return RET_ILSEQ;\n");
1574 printf(" return RET_TOOFEW(0);\n");
1576 printf(" return RET_ILSEQ;\n");
1578 printf(" return RET_TOOFEW(0);\n");
1580 printf(" return RET_ILSEQ;\n");
/* Emit <name>_wctomb: binary search the Unicode ranges, consult the
   bitmap, subtract the diff and split the index back into four bytes. */
1585 printf("static int\n");
1586 printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name
);
1588 printf(" if (n >= 4) {\n");
1589 printf(" unsigned int i = wc;\n");
/* Quick reject outside the Unicode span covered by the ranges. */
1590 printf(" if (i >= 0x%04x && i <= 0x%04x) {\n", ranges
[0].low
+ ranges
[0].diff
, ranges
[ranges_count
-1].high
+ ranges
[ranges_count
-1].diff
);
1591 printf(" unsigned int k1 = 0;\n");
1592 printf(" unsigned int k2 = %d;\n", ranges_count
-1);
1593 printf(" while (k1 < k2) {\n");
1594 printf(" unsigned int k = (k1 + k2) / 2;\n");
1595 printf(" if (i <= %s_uni2charset_ranges[2*k+1])\n", name
);
1596 printf(" k2 = k;\n");
1597 printf(" else if (i >= %s_uni2charset_ranges[2*k+2])\n", name
);
1598 printf(" k1 = k + 1;\n");
1600 printf(" return RET_ILUNI;\n");
1603 printf(" unsigned int bitmap_index = i - %s_uni2charset_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name
, name
);
1604 printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name
);
1605 printf(" unsigned int diff = %s_ranges[k1].diff;\n", name
);
1606 printf(" i -= diff;\n");
/* "%%" prints a literal '%' in the generated code; the digits are peeled
   off in reverse order of the linearisation (10, 126, 10). */
1607 printf(" r[3] = (i %% 10) + 0x30; i = i / 10;\n");
1608 printf(" r[2] = (i %% 126) + 0x81; i = i / 126;\n");
1609 printf(" r[1] = (i %% 10) + 0x30; i = i / 10;\n");
1610 printf(" r[0] = i + 0x81;\n");
1611 printf(" return 4;\n");
1615 printf(" return RET_ILUNI;\n");
1617 printf(" return RET_TOOSMALL;\n");
1623 int main (int argc
, char *argv
[])
1625 const char* charsetname
;
1630 charsetname
= argv
[1];
1633 output_title(charsetname
);
1635 if (!strcmp(name
,"gb2312")
1636 || !strcmp(name
,"isoir165ext") || !strcmp(name
,"gb12345ext")
1637 || !strcmp(name
,"jisx0208") || !strcmp(name
,"jisx0212"))
1639 else if (!strcmp(name
,"cns11643_1") || !strcmp(name
,"cns11643_2")
1640 || !strcmp(name
,"cns11643_3") || !strcmp(name
,"cns11643_4a")
1641 || !strcmp(name
,"cns11643_4b") || !strcmp(name
,"cns11643_5")
1642 || !strcmp(name
,"cns11643_6") || !strcmp(name
,"cns11643_7")
1643 || !strcmp(name
,"cns11643_15"))
1644 do_normal_only_charset2uni(name
);
1645 else if (!strcmp(name
,"cns11643_inv"))
1646 do_cns11643_only_uni2charset(name
);
1647 else if (!strcmp(name
,"gbkext1"))
1648 do_gbk1_only_charset2uni(name
);
1649 else if (!strcmp(name
,"gbkext2"))
1650 do_gbk2_only_charset2uni(name
);
1651 else if (!strcmp(name
,"gbkext_inv"))
1652 do_gbk1_only_uni2charset(name
);
1653 else if (!strcmp(name
,"cp936ext") || !strcmp(name
,"gb18030ext"))
1655 else if (!strcmp(name
,"ksc5601"))
1657 else if (!strcmp(name
,"uhc_1"))
1659 else if (!strcmp(name
,"uhc_2"))
1661 else if (!strcmp(name
,"big5") || !strcmp(name
,"cp950ext"))
1663 else if (!strcmp(name
,"hkscs"))
1665 else if (!strcmp(name
,"johab_hangul"))
1666 do_johab_hangul(name
);
1667 else if (!strcmp(name
,"cp932ext"))
1669 else if (!strcmp(name
,"gb18030uni"))
1670 do_gb18030uni(name
);