Tweak.
[libiconv.git] / tools / cjk_tab_to_h.c
blob595a76c131bcec0f619440df4f7d3db6c4ef4ecd
1 /* Copyright (C) 1999-2001 Free Software Foundation, Inc.
2 This file is part of the GNU LIBICONV Tools.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 * Generates a CJK character set table from a .TXT table as found on
20 * ftp.unicode.org or in the X nls directory.
21 * Examples:
23 * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
24 * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
25 * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
27 * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
28 * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
29 * ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
30 * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
31 * ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
33 * ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
35 * ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <stdbool.h>
41 #include <string.h>
/* A half-open range [start, end) of linear table indices. */
typedef struct Block {
  int start;    /* first index that belongs to the range */
  int end;      /* one past the last index that belongs to the range */
} Block;
48 typedef struct {
49 int rows; /* number of possible values for the 1st byte */
50 int cols; /* number of possible values for the 2nd byte */
51 int (*row_byte) (int row); /* returns the 1st byte value for a given row */
52 int (*col_byte) (int col); /* returns the 2nd byte value for a given col */
53 int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */
54 int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */
55 const char* check_row_expr; /* format string for 1st byte value checking */
56 const char* check_col_expr; /* format string for 2nd byte value checking */
57 const char* byte_row_expr; /* format string for 1st byte value to row */
58 const char* byte_col_expr; /* format string for 2nd byte value to col */
59 int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */
60 /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
61 Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
62 int* charsetpage; /* charsetpage[0..rows]: how large is a page for a row */
63 int ncharsetblocks;
64 Block* charsetblocks; /* blocks[0..nblocks-1] */
65 int* uni2charset; /* uni2charset[0x0000..0xffff] */
66 } Encoding;
/*
 * Outputs the file title: the license comment of the generated header,
 * followed by a short comment that carries the character set name.
 */
static void output_title (const char *charsetname)
{
  /* Everything up to the line that contains the character set name. */
  static const char preamble[] =
    "/*\n"
    " * Copyright (C) 1999-2001 Free Software Foundation, Inc.\n"
    " * This file is part of the GNU LIBICONV Library.\n"
    " *\n"
    " * The GNU LIBICONV Library is free software; you can redistribute it\n"
    " * and/or modify it under the terms of the GNU Library General Public\n"
    " * License as published by the Free Software Foundation; either version 2\n"
    " * of the License, or (at your option) any later version.\n"
    " *\n"
    " * The GNU LIBICONV Library is distributed in the hope that it will be\n"
    " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n"
    " * Library General Public License for more details.\n"
    " *\n"
    " * You should have received a copy of the GNU Library General Public\n"
    " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n"
    " * If not, write to the Free Software Foundation, Inc., 59 Temple Place -\n"
    " * Suite 330, Boston, MA 02111-1307, USA.\n"
    " */\n"
    "\n"
    "/*\n";

  fputs(preamble, stdout);
  printf(" * %s\n", charsetname);
  fputs(" */\n"
        "\n", stdout);
}
100 * Reads the charset2uni table from standard input.
102 static void read_table (Encoding* enc)
104 int row, col, i, i1, i2, c, j;
106 enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
107 for (row = 0; row < enc->rows; row++)
108 enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
110 for (row = 0; row < enc->rows; row++)
111 for (col = 0; col < enc->cols; col++)
112 enc->charset2uni[row][col] = 0xfffd;
114 c = getc(stdin);
115 ungetc(c,stdin);
116 if (c == '#') {
117 /* Read a unicode.org style .TXT file. */
118 for (;;) {
119 c = getc(stdin);
120 if (c == EOF)
121 break;
122 if (c == '\n' || c == ' ' || c == '\t')
123 continue;
124 if (c == '#') {
125 do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
126 continue;
128 ungetc(c,stdin);
129 if (scanf("0x%x", &j) != 1)
130 exit(1);
131 i1 = j >> 8;
132 i2 = j & 0xff;
133 row = enc->byte_row(i1);
134 col = enc->byte_col(i2);
135 if (row < 0 || col < 0) {
136 fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
137 exit(1);
139 if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
140 exit(1);
142 } else {
143 /* Read a table of hexadecimal Unicode values. */
144 for (i1 = 32; i1 < 132; i1++)
145 for (i2 = 32; i2 < 132; i2++) {
146 i = scanf("%x", &j);
147 if (i == EOF)
148 goto read_done;
149 if (i != 1)
150 exit(1);
151 if (j < 0 || j == 0xffff)
152 j = 0xfffd;
153 if (j != 0xfffd) {
154 if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
155 fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
156 exit (1);
158 enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
161 read_done: ;
166 * Computes the charsetpage[0..rows] array.
168 static void find_charset2uni_pages (Encoding* enc)
170 int row, col;
172 enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));
174 for (row = 0; row <= enc->rows; row++)
175 enc->charsetpage[row] = 0;
177 for (row = 0; row < enc->rows; row++) {
178 int used = 0;
179 for (col = 0; col < enc->cols; col++)
180 if (enc->charset2uni[row][col] != 0xfffd)
181 used = col+1;
182 enc->charsetpage[row] = used;
187 * Fills in nblocks and blocks.
189 static void find_charset2uni_blocks (Encoding* enc)
191 int n, row, lastrow;
193 enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));
195 n = 0;
196 for (row = 0; row < enc->rows; row++)
197 if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
198 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
199 enc->charsetblocks[n].start = row * enc->cols;
200 enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
201 n++;
203 enc->ncharsetblocks = n;
207 * Outputs the charset to unicode table and function.
209 static void output_charset2uni (const char* name, Encoding* enc)
211 int row, col, lastrow, col_max, i, i1_min, i1_max;
213 find_charset2uni_pages(enc);
215 find_charset2uni_blocks(enc);
217 for (row = 0; row < enc->rows; row++)
218 if (enc->charsetpage[row] > 0) {
219 if (row == 0 || enc->charsetpage[row-1] == 0) {
220 /* Start a new block. */
221 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
222 printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
223 name, enc->row_byte(row),
224 (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
226 printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
227 col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
228 for (col = 0; col < col_max; col++) {
229 printf(" 0x%04x,", enc->charset2uni[row][col]);
230 if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
232 printf("\n");
233 if (enc->charsetpage[row+1] == 0) {
234 /* End a block. */
235 printf("};\n");
238 printf("\n");
240 printf("static int\n");
241 printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
242 printf("{\n");
243 printf(" unsigned char c1 = s[0];\n");
244 printf(" if (");
245 for (i = 0; i < enc->ncharsetblocks; i++) {
246 i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
247 i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
248 if (i > 0)
249 printf(" || ");
250 if (i1_min == i1_max)
251 printf("(c1 == 0x%02x)", i1_min);
252 else
253 printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
255 printf(") {\n");
256 printf(" if (n >= 2) {\n");
257 printf(" unsigned char c2 = s[1];\n");
258 printf(" if (");
259 printf(enc->check_col_expr, "c2");
260 printf(") {\n");
261 printf(" unsigned int i = %d * (", enc->cols);
262 printf(enc->byte_row_expr, "c1");
263 printf(") + (");
264 printf(enc->byte_col_expr, "c2");
265 printf(");\n");
266 printf(" unsigned short wc = 0xfffd;\n");
267 for (i = 0; i < enc->ncharsetblocks; i++) {
268 printf(" ");
269 if (i > 0)
270 printf("} else ");
271 if (i < enc->ncharsetblocks-1)
272 printf("if (i < %d) ", enc->charsetblocks[i+1].start);
273 printf("{\n");
274 printf(" if (i < %d)\n", enc->charsetblocks[i].end);
275 printf(" wc = %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
276 if (enc->charsetblocks[i].start > 0)
277 printf("-%d", enc->charsetblocks[i].start);
278 printf("];\n");
280 printf(" }\n");
281 printf(" if (wc != 0xfffd) {\n");
282 printf(" *pwc = (ucs4_t) wc;\n");
283 printf(" return 2;\n");
284 printf(" }\n");
285 printf(" }\n");
286 printf(" return RET_ILSEQ;\n");
287 printf(" }\n");
288 printf(" return RET_TOOFEW(0);\n");
289 printf(" }\n");
290 printf(" return RET_ILSEQ;\n");
291 printf("}\n");
292 printf("\n");
296 * Outputs the charset to unicode table and function.
297 * (Suitable if the mapping function is well defined, i.e. has no holes, and
298 * is monotonically increasing with small gaps only.)
300 static void output_charset2uni_noholes_monotonic (const char* name, Encoding* enc)
302 int row, col, lastrow, r, col_max, i, i1_min, i1_max;
304 /* Choose stepsize so that stepsize*steps_per_row >= enc->cols, and
305 enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]
306 is always < 0x100. */
307 int steps_per_row = 2;
308 int stepsize = (enc->cols + steps_per_row-1) / steps_per_row;
310 find_charset2uni_pages(enc);
312 find_charset2uni_blocks(enc);
314 for (row = 0; row < enc->rows; row++)
315 if (enc->charsetpage[row] > 0) {
316 if (row == 0 || enc->charsetpage[row-1] == 0) {
317 /* Start a new block. */
318 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
319 printf("static const unsigned short %s_2uni_main_page%02x[%d] = {\n ",
320 name, enc->row_byte(row),
321 steps_per_row*(lastrow-row+1));
322 for (r = row; r <= lastrow; r++) {
323 for (i = 0; i < steps_per_row; i++)
324 printf(" 0x%04x,", enc->charset2uni[r][i*stepsize]);
325 if (((r-row) % 4) == 3 && (r < lastrow)) printf("\n ");
327 printf("\n");
328 printf("};\n");
329 printf("static const unsigned char %s_2uni_page%02x[%d] = {\n",
330 name, enc->row_byte(row),
331 (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
333 printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
334 col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
335 for (col = 0; col < col_max; col++) {
336 printf(" 0x%02x,", enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]);
337 if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
339 printf("\n");
340 if (enc->charsetpage[row+1] == 0) {
341 /* End a block. */
342 printf("};\n");
345 printf("\n");
347 printf("static int\n");
348 printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
349 printf("{\n");
350 printf(" unsigned char c1 = s[0];\n");
351 printf(" if (");
352 for (i = 0; i < enc->ncharsetblocks; i++) {
353 i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
354 i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
355 if (i > 0)
356 printf(" || ");
357 if (i1_min == i1_max)
358 printf("(c1 == 0x%02x)", i1_min);
359 else
360 printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
362 printf(") {\n");
363 printf(" if (n >= 2) {\n");
364 printf(" unsigned char c2 = s[1];\n");
365 printf(" if (");
366 printf(enc->check_col_expr, "c2");
367 printf(") {\n");
368 printf(" unsigned int row = ");
369 printf(enc->byte_row_expr, "c1");
370 printf(";\n");
371 printf(" unsigned int col = ");
372 printf(enc->byte_col_expr, "c2");
373 printf(";\n");
374 printf(" unsigned int i = %d * row + col;\n", enc->cols);
375 printf(" unsigned short wc = 0xfffd;\n");
376 for (i = 0; i < enc->ncharsetblocks; i++) {
377 printf(" ");
378 if (i > 0)
379 printf("} else ");
380 if (i < enc->ncharsetblocks-1)
381 printf("if (i < %d) ", enc->charsetblocks[i+1].start);
382 printf("{\n");
383 printf(" if (i < %d)\n", enc->charsetblocks[i].end);
384 printf(" wc = %s_2uni_main_page%02x[%d*", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols), steps_per_row);
385 if (enc->charsetblocks[i].start > 0)
386 printf("(row-%d)", enc->charsetblocks[i].start / enc->cols);
387 else
388 printf("row");
389 printf("+");
390 if (steps_per_row == 2)
391 printf("(col>=%d?1:0)", stepsize);
392 else
393 printf("col/%d", stepsize);
394 printf("] + %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
395 if (enc->charsetblocks[i].start > 0)
396 printf("-%d", enc->charsetblocks[i].start);
397 printf("];\n");
399 printf(" }\n");
400 printf(" if (wc != 0xfffd) {\n");
401 printf(" *pwc = (ucs4_t) wc;\n");
402 printf(" return 2;\n");
403 printf(" }\n");
404 printf(" }\n");
405 printf(" return RET_ILSEQ;\n");
406 printf(" }\n");
407 printf(" return RET_TOOFEW(0);\n");
408 printf(" }\n");
409 printf(" return RET_ILSEQ;\n");
410 printf("}\n");
411 printf("\n");
415 * Computes the uni2charset[0x0000..0xffff] array.
417 static void invert (Encoding* enc)
419 int row, col, j;
421 enc->uni2charset = (int*) malloc(0x10000*sizeof(int));
423 for (j = 0; j < 0x10000; j++)
424 enc->uni2charset[j] = 0;
426 for (row = 0; row < enc->rows; row++)
427 for (col = 0; col < enc->cols; col++) {
428 j = enc->charset2uni[row][col];
429 if (j != 0xfffd)
430 enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
435 * Outputs the unicode to charset table and function, using a linear array.
436 * (Suitable if the table is dense.)
438 static void output_uni2charset_dense (const char* name, Encoding* enc)
440 /* Like in 8bit_tab_to_h.c */
441 bool pages[0x100];
442 int line[0x2000];
443 int tableno;
444 struct { int minline; int maxline; int usecount; } tables[0x2000];
445 bool first;
446 int row, col, j, p, j1, j2, t;
448 for (p = 0; p < 0x100; p++)
449 pages[p] = false;
450 for (row = 0; row < enc->rows; row++)
451 for (col = 0; col < enc->cols; col++) {
452 j = enc->charset2uni[row][col];
453 if (j != 0xfffd)
454 pages[j>>8] = true;
456 for (j1 = 0; j1 < 0x2000; j1++) {
457 bool all_invalid = true;
458 for (j2 = 0; j2 < 8; j2++) {
459 j = 8*j1+j2;
460 if (enc->uni2charset[j] != 0)
461 all_invalid = false;
463 if (all_invalid)
464 line[j1] = -1;
465 else
466 line[j1] = 0;
468 tableno = 0;
469 for (j1 = 0; j1 < 0x2000; j1++) {
470 if (line[j1] >= 0) {
471 if (tableno > 0
472 && ((j1 > 0 && line[j1-1] == tableno-1)
473 || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
474 && j1 - tables[tableno-1].maxline <= 8))) {
475 line[j1] = tableno-1;
476 tables[tableno-1].maxline = j1;
477 } else {
478 tableno++;
479 line[j1] = tableno-1;
480 tables[tableno-1].minline = tables[tableno-1].maxline = j1;
484 for (t = 0; t < tableno; t++) {
485 tables[t].usecount = 0;
486 j1 = 8*tables[t].minline;
487 j2 = 8*(tables[t].maxline+1);
488 for (j = j1; j < j2; j++)
489 if (enc->uni2charset[j] != 0)
490 tables[t].usecount++;
493 p = -1;
494 for (t = 0; t < tableno; t++)
495 if (tables[t].usecount > 1) {
496 p = tables[t].minline >> 5;
497 printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
498 for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
499 if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
500 printf(" /* 0x%04x */\n", 8*j1);
501 printf(" ");
502 for (j2 = 0; j2 < 8; j2++) {
503 j = 8*j1+j2;
504 printf(" 0x%04x,", enc->uni2charset[j]);
506 printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
508 printf("};\n");
510 if (p >= 0)
511 printf("\n");
513 printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
514 printf("{\n");
515 printf(" if (n >= 2) {\n");
516 printf(" unsigned short c = 0;\n");
517 first = true;
518 for (j1 = 0; j1 < 0x2000;) {
519 t = line[j1];
520 for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
521 if (t >= 0) {
522 if (j1 != tables[t].minline) abort();
523 if (j2 > tables[t].maxline+1) abort();
524 j2 = tables[t].maxline+1;
525 if (first)
526 printf(" ");
527 else
528 printf(" else ");
529 first = false;
530 if (tables[t].usecount == 0) abort();
531 if (tables[t].usecount == 1) {
532 if (j2 != j1+1) abort();
533 for (j = 8*j1; j < 8*j2; j++)
534 if (enc->uni2charset[j] != 0) {
535 printf("if (wc == 0x%04x)\n c = 0x%02x;\n", j, enc->uni2charset[j]);
536 break;
538 } else {
539 if (j1 == 0) {
540 printf("if (wc < 0x%04x)", 8*j2);
541 } else {
542 printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
544 printf("\n c = %s_page%02x[wc", name, j1 >> 5);
545 if (tables[t].minline > 0)
546 printf("-0x%04x", 8*j1);
547 printf("];\n");
550 j1 = j2;
552 printf(" if (c != 0) {\n");
553 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
554 printf(" return 2;\n");
555 printf(" }\n");
556 printf(" return RET_ILSEQ;\n");
557 printf(" }\n");
558 printf(" return RET_TOOSMALL;\n");
559 printf("}\n");
563 * Outputs the unicode to charset table and function, using a packed array.
564 * (Suitable if the table is sparse.)
565 * The argument 'monotonic' may be set to true if the mapping is monotonically
566 * increasing with small gaps only.
568 static void output_uni2charset_sparse (const char* name, Encoding* enc, bool monotonic)
570 bool pages[0x100];
571 Block pageblocks[0x100]; int npageblocks;
572 int indx2charset[0x10000];
573 int summary_indx[0x1000];
574 int summary_used[0x1000];
575 int i, row, col, j, p, j1, j2, indx;
576 /* for monotonic: */
577 int log2_stepsize = (!strcmp(name,"uhc_2") ? 6 : 7);
578 int stepsize = 1 << log2_stepsize;
579 int indxsteps;
581 /* Fill pages[0x100]. */
582 for (p = 0; p < 0x100; p++)
583 pages[p] = false;
584 for (row = 0; row < enc->rows; row++)
585 for (col = 0; col < enc->cols; col++) {
586 j = enc->charset2uni[row][col];
587 if (j != 0xfffd)
588 pages[j>>8] = true;
591 #if 0
592 for (p = 0; p < 0x100; p++)
593 if (pages[p]) {
594 printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
595 for (j1 = 0; j1 < 32; j1++) {
596 printf(" ");
597 for (j2 = 0; j2 < 8; j2++)
598 printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
599 printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
601 printf("};\n");
603 printf("\n");
604 #endif
606 /* Fill summary_indx[] and summary_used[]. */
607 indx = 0;
608 for (j1 = 0; j1 < 0x1000; j1++) {
609 summary_indx[j1] = indx;
610 summary_used[j1] = 0;
611 for (j2 = 0; j2 < 16; j2++) {
612 j = 16*j1+j2;
613 if (enc->uni2charset[j] != 0) {
614 indx2charset[indx++] = enc->uni2charset[j];
615 summary_used[j1] |= (1 << j2);
620 /* Fill npageblocks and pageblocks[]. */
621 npageblocks = 0;
622 for (p = 0; p < 0x100; ) {
623 if (pages[p] && (p == 0 || !pages[p-1])) {
624 pageblocks[npageblocks].start = 16*p;
625 do p++; while (p < 0x100 && pages[p]);
626 j1 = 16*p;
627 while (summary_used[j1-1] == 0) j1--;
628 pageblocks[npageblocks].end = j1;
629 npageblocks++;
630 } else
631 p++;
634 if (monotonic) {
635 indxsteps = (indx + stepsize-1) / stepsize;
636 printf("static const unsigned short %s_2charset_main[%d] = {\n", name, indxsteps);
637 for (i = 0; i < indxsteps; ) {
638 if ((i % 8) == 0) printf(" ");
639 printf(" 0x%04x,", indx2charset[i*stepsize]);
640 i++;
641 if ((i % 8) == 0 || i == indxsteps) printf("\n");
643 printf("};\n");
644 printf("static const unsigned char %s_2charset[%d] = {\n", name, indx);
645 for (i = 0; i < indx; ) {
646 if ((i % 8) == 0) printf(" ");
647 printf(" 0x%02x,", indx2charset[i] - indx2charset[i/stepsize*stepsize]);
648 i++;
649 if ((i % 8) == 0 || i == indx) printf("\n");
651 printf("};\n");
652 } else {
653 printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
654 for (i = 0; i < indx; ) {
655 if ((i % 8) == 0) printf(" ");
656 printf(" 0x%04x,", indx2charset[i]);
657 i++;
658 if ((i % 8) == 0 || i == indx) printf("\n");
660 printf("};\n");
662 printf("\n");
663 for (i = 0; i < npageblocks; i++) {
664 printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
665 pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
666 for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
667 if (((16*j1) % 0x100) == 0) printf(" /""* 0x%04x *""/\n", 16*j1);
668 if ((j1 % 4) == 0) printf(" ");
669 printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
670 j1++;
671 if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
673 printf("};\n");
675 printf("\n");
677 printf("static int\n");
678 printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
679 printf("{\n");
680 printf(" if (n >= 2) {\n");
681 printf(" const Summary16 *summary = NULL;\n");
682 for (i = 0; i < npageblocks; i++) {
683 printf(" ");
684 if (i > 0)
685 printf("else ");
686 printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
687 16*pageblocks[i].start, 16*pageblocks[i].end);
688 printf(" summary = &%s_uni2indx_page%02x[(wc>>4)", name,
689 pageblocks[i].start/16);
690 if (pageblocks[i].start > 0)
691 printf("-0x%03x", pageblocks[i].start);
692 printf("];\n");
694 printf(" if (summary) {\n");
695 printf(" unsigned short used = summary->used;\n");
696 printf(" unsigned int i = wc & 0x0f;\n");
697 printf(" if (used & ((unsigned short) 1 << i)) {\n");
698 printf(" unsigned short c;\n");
699 printf(" /* Keep in `used' only the bits 0..i-1. */\n");
700 printf(" used &= ((unsigned short) 1 << i) - 1;\n");
701 printf(" /* Add `summary->indx' and the number of bits set in `used'. */\n");
702 printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
703 printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
704 printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
705 printf(" used = (used & 0x00ff) + (used >> 8);\n");
706 if (monotonic) {
707 printf(" used += summary->indx;\n");
708 printf(" c = %s_2charset_main[used>>%d] + %s_2charset[used];\n", name, log2_stepsize, name);
709 } else
710 printf(" c = %s_2charset[summary->indx + used];\n", name);
711 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
712 printf(" return 2;\n");
713 printf(" }\n");
714 printf(" }\n");
715 printf(" return RET_ILSEQ;\n");
716 printf(" }\n");
717 printf(" return RET_TOOSMALL;\n");
718 printf("}\n");
/* ISO-2022/EUC specifics */

/* Rows and cols 0..93 correspond to the byte values 0x21..0x7e. */

/* Returns the 1st byte value for a given row. */
static int row_byte_normal (int row) { return 0x21+row; }

/* Returns the 2nd byte value for a given col. */
static int col_byte_normal (int col) { return 0x21+col; }

/* Converts a 1st byte value to a row; returns -1 for a byte outside
   0x21..0x7e, so that callers never index past the 94 rows. */
static int byte_row_normal (int byte) {
  return (byte >= 0x21 && byte < 0x7f ? byte-0x21 : -1);
}

/* Converts a 2nd byte value to a col; returns -1 for a byte outside
   0x21..0x7e, so that callers never index past the 94 cols. */
static int byte_col_normal (int byte) {
  return (byte >= 0x21 && byte < 0x7f ? byte-0x21 : -1);
}
728 static void do_normal (const char* name)
730 Encoding enc;
732 enc.rows = 94;
733 enc.cols = 94;
734 enc.row_byte = row_byte_normal;
735 enc.col_byte = col_byte_normal;
736 enc.byte_row = byte_row_normal;
737 enc.byte_col = byte_col_normal;
738 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
739 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
740 enc.byte_row_expr = "%1$s - 0x21";
741 enc.byte_col_expr = "%1$s - 0x21";
743 read_table(&enc);
744 output_charset2uni(name,&enc);
745 invert(&enc); output_uni2charset_sparse(name,&enc,false);
748 /* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
749 starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
750 order. There are 75 out-of-order values, scattered all throughout the table.
753 static void do_normal_only_charset2uni (const char* name)
755 Encoding enc;
757 enc.rows = 94;
758 enc.cols = 94;
759 enc.row_byte = row_byte_normal;
760 enc.col_byte = col_byte_normal;
761 enc.byte_row = byte_row_normal;
762 enc.byte_col = byte_col_normal;
763 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
764 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
765 enc.byte_row_expr = "%1$s - 0x21";
766 enc.byte_col_expr = "%1$s - 0x21";
768 read_table(&enc);
769 output_charset2uni(name,&enc);
/* CNS 11643 specifics - trick to put two tables into one */

/* Returns the combined 1st byte value for a given row: the plane index
   (row / 94, counted from 0) in the bits above bit 7, and the byte
   0x21 + row % 94 below. */
static int row_byte_cns11643 (int row)
{
  int plane = row / 94;
  int within_plane = row % 94;

  return 0x100 * plane + within_plane + 0x21;
}

/* Converts a combined 1st byte value, whose bits above bit 7 hold the plane
   number 1..3, to a row; each plane occupies 94 consecutive rows.
   Returns -1 for a value outside 0x100..0x3ff. */
static int byte_row_cns11643 (int byte)
{
  if (byte >= 0x100 && byte < 0x200)
    return byte-0x121;
  if (byte >= 0x200 && byte < 0x300)
    return byte-0x221+94;
  if (byte >= 0x300 && byte < 0x400)
    return byte-0x321+2*94;
  return -1;
}
784 static void do_cns11643_only_uni2charset (const char* name)
786 Encoding enc;
787 int j, x;
789 enc.rows = 3*94;
790 enc.cols = 94;
791 enc.row_byte = row_byte_cns11643;
792 enc.col_byte = col_byte_normal;
793 enc.byte_row = byte_row_cns11643;
794 enc.byte_col = byte_col_normal;
795 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
796 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
797 enc.byte_row_expr = "%1$s - 0x21";
798 enc.byte_col_expr = "%1$s - 0x21";
800 read_table(&enc);
801 invert(&enc);
802 /* Move the 2 plane bits into the unused bits 15 and 7. */
803 for (j = 0; j < 0x10000; j++) {
804 x = enc.uni2charset[j];
805 if (x != 0) {
806 if (x & 0x8080) abort();
807 switch (x >> 16) {
808 case 0: /* plane 1 */ x = (x & 0xffff) | 0x0000; break;
809 case 1: /* plane 2 */ x = (x & 0xffff) | 0x0080; break;
810 case 2: /* plane 3 */ x = (x & 0xffff) | 0x8000; break;
811 default: abort();
813 enc.uni2charset[j] = x;
816 output_uni2charset_sparse(name,&enc,false);
/* GBK specifics */

/* Returns the 1st byte value for a given row; row 0 is the byte 0x81. */
static int row_byte_gbk1 (int row)
{
  return row + 0x81;
}

/* Returns the 2nd byte value for a given col.  The cols below 0x3f start
   at the byte 0x40; the cols from 0x3f on are shifted by one more, which
   leaves out the byte 0x7f. */
static int col_byte_gbk1 (int col)
{
  int offset = 0x40;

  if (col >= 0x3f)
    offset = 0x41;
  return offset + col;
}

/* Converts a 1st byte value in 0x81..0xfe to a row, else -1. */
static int byte_row_gbk1 (int byte)
{
  return (byte >= 0x81 && byte < 0xff ? byte-0x81 : -1);
}

/* Converts a 2nd byte value in 0x40..0x7e or 0x80..0xfe to a col, else -1. */
static int byte_col_gbk1 (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte-0x40;
  if (byte >= 0x80 && byte < 0xff)
    return byte-0x41;
  return -1;
}
842 static void do_gbk1 (const char* name)
844 Encoding enc;
846 enc.rows = 126;
847 enc.cols = 190;
848 enc.row_byte = row_byte_gbk1;
849 enc.col_byte = col_byte_gbk1;
850 enc.byte_row = byte_row_gbk1;
851 enc.byte_col = byte_col_gbk1;
852 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
853 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
854 enc.byte_row_expr = "%1$s - 0x81";
855 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
857 read_table(&enc);
858 output_charset2uni(name,&enc);
859 invert(&enc); output_uni2charset_dense(name,&enc);
862 static void do_gbk1_only_charset2uni (const char* name)
864 Encoding enc;
866 enc.rows = 126;
867 enc.cols = 190;
868 enc.row_byte = row_byte_gbk1;
869 enc.col_byte = col_byte_gbk1;
870 enc.byte_row = byte_row_gbk1;
871 enc.byte_col = byte_col_gbk1;
872 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
873 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
874 enc.byte_row_expr = "%1$s - 0x81";
875 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
877 read_table(&enc);
878 output_charset2uni(name,&enc);
/* Returns the 1st byte value for a given row; row 0 is the byte 0x81. */
static int row_byte_gbk2 (int row)
{
  return row + 0x81;
}

/* Returns the 2nd byte value for a given col.  The cols below 0x3f start
   at the byte 0x40; the cols from 0x3f on are shifted by one more, which
   leaves out the byte 0x7f. */
static int col_byte_gbk2 (int col)
{
  int offset = 0x40;

  if (col >= 0x3f)
    offset = 0x41;
  return offset + col;
}

/* Converts a 1st byte value in 0x81..0xfe to a row, else -1. */
static int byte_row_gbk2 (int byte)
{
  return (byte >= 0x81 && byte < 0xff ? byte-0x81 : -1);
}

/* Converts a 2nd byte value in 0x40..0x7e or 0x80..0xa0 to a col, else -1. */
static int byte_col_gbk2 (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte-0x40;
  if (byte >= 0x80 && byte < 0xa1)
    return byte-0x41;
  return -1;
}
902 static void do_gbk2_only_charset2uni (const char* name)
904 Encoding enc;
906 enc.rows = 126;
907 enc.cols = 96;
908 enc.row_byte = row_byte_gbk2;
909 enc.col_byte = col_byte_gbk2;
910 enc.byte_row = byte_row_gbk2;
911 enc.byte_col = byte_col_gbk2;
912 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
913 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
914 enc.byte_row_expr = "%1$s - 0x81";
915 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
917 read_table(&enc);
918 output_charset2uni(name,&enc);
921 static void do_gbk1_only_uni2charset (const char* name)
923 Encoding enc;
925 enc.rows = 126;
926 enc.cols = 190;
927 enc.row_byte = row_byte_gbk1;
928 enc.col_byte = col_byte_gbk1;
929 enc.byte_row = byte_row_gbk1;
930 enc.byte_col = byte_col_gbk1;
931 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
932 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
933 enc.byte_row_expr = "%1$s - 0x81";
934 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
936 read_table(&enc);
937 invert(&enc); output_uni2charset_sparse(name,&enc,false);
940 /* KSC 5601 specifics */
943 * Reads the charset2uni table from standard input.
945 static void read_table_ksc5601 (Encoding* enc)
947 int row, col, i, i1, i2, c, j;
949 enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
950 for (row = 0; row < enc->rows; row++)
951 enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
953 for (row = 0; row < enc->rows; row++)
954 for (col = 0; col < enc->cols; col++)
955 enc->charset2uni[row][col] = 0xfffd;
957 c = getc(stdin);
958 ungetc(c,stdin);
959 if (c == '#') {
960 /* Read a unicode.org style .TXT file. */
961 for (;;) {
962 c = getc(stdin);
963 if (c == EOF)
964 break;
965 if (c == '\n' || c == ' ' || c == '\t')
966 continue;
967 if (c == '#') {
968 do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
969 continue;
971 ungetc(c,stdin);
972 if (scanf("0x%x", &j) != 1)
973 exit(1);
974 i1 = j >> 8;
975 i2 = j & 0xff;
976 if (scanf(" 0x%x", &j) != 1)
977 exit(1);
978 /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
979 = KS X 1001.1992, ignore the rest. */
980 if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
981 continue; /* KSC5601 specific */
982 i1 &= 0x7f; /* KSC5601 specific */
983 i2 &= 0x7f; /* KSC5601 specific */
984 row = enc->byte_row(i1);
985 col = enc->byte_col(i2);
986 if (row < 0 || col < 0) {
987 fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
988 exit(1);
990 enc->charset2uni[row][col] = j;
992 } else {
993 /* Read a table of hexadecimal Unicode values. */
994 for (i1 = 33; i1 < 127; i1++)
995 for (i2 = 33; i2 < 127; i2++) {
996 i = scanf("%x", &j);
997 if (i == EOF)
998 goto read_done;
999 if (i != 1)
1000 exit(1);
1001 if (j < 0 || j == 0xffff)
1002 j = 0xfffd;
1003 if (j != 0xfffd) {
1004 if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
1005 fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
1006 exit (1);
1008 enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
1011 read_done: ;
1015 static void do_ksc5601 (const char* name)
1017 Encoding enc;
1019 enc.rows = 94;
1020 enc.cols = 94;
1021 enc.row_byte = row_byte_normal;
1022 enc.col_byte = col_byte_normal;
1023 enc.byte_row = byte_row_normal;
1024 enc.byte_col = byte_col_normal;
1025 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1026 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1027 enc.byte_row_expr = "%1$s - 0x21";
1028 enc.byte_col_expr = "%1$s - 0x21";
1030 read_table_ksc5601(&enc);
1031 output_charset2uni(name,&enc);
1032 invert(&enc); output_uni2charset_sparse(name,&enc,false);
1035 /* UHC specifics */
1037 /* UHC part 1: 0x{81..A0}{41..5A,61..7A,81..FE} */
/* UHC part 1: returns the 1st byte value for a given row
   (row 0 is byte 0x81).  */
static int row_byte_uhc_1 (int row) {
  return 0x81 + row;
}
/* UHC part 1: returns the 2nd byte value for a given column.
   Columns 0..0x19 map to 0x41..0x5a, columns 0x1a..0x33 to 0x61..0x7a,
   and columns from 0x34 on to 0x81 and up.  */
static int col_byte_uhc_1 (int col) {
  return (col >= 0x34 ? 0x4d : col >= 0x1a ? 0x47 : 0x41) + col;
}
/* UHC part 1: returns the row for a given 1st byte value
   (0x81..0xa0 -> 0..31), or -1 if the byte is outside that range.  */
static int byte_row_uhc_1 (int byte) {
  if (byte >= 0x81 && byte < 0xa1)
    return byte-0x81;
  else
    return -1;
}
/* UHC part 1: returns the column for a given 2nd byte value, or -1 if
   the byte is in none of the ranges 0x41..0x5a, 0x61..0x7a, 0x81..0xfe.
   The three ranges are packed into consecutive columns.  */
static int byte_col_uhc_1 (int byte) {
  if (byte >= 0x41 && byte < 0x5b)
    return byte-0x41;
  else if (byte >= 0x61 && byte < 0x7b)
    return byte-0x47;
  else if (byte >= 0x81 && byte < 0xff)
    return byte-0x4d;
  else
    return -1;
}
1062 static void do_uhc_1 (const char* name)
1064 Encoding enc;
1066 enc.rows = 32;
1067 enc.cols = 178;
1068 enc.row_byte = row_byte_uhc_1;
1069 enc.col_byte = col_byte_uhc_1;
1070 enc.byte_row = byte_row_uhc_1;
1071 enc.byte_col = byte_col_uhc_1;
1072 enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa1)";
1073 enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xff)";
1074 enc.byte_row_expr = "%1$s - 0x81";
1075 enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1077 read_table(&enc);
1078 output_charset2uni_noholes_monotonic(name,&enc);
1079 invert(&enc); output_uni2charset_sparse(name,&enc,true);
/* UHC part 2: 0x{A1..C6}{41..5A,61..7A,81..A0} */
/* UHC part 2: returns the 1st byte value for a given row
   (row 0 is byte 0xa1).  */
static int row_byte_uhc_2 (int row) {
  return 0xa1 + row;
}
/* UHC part 2: returns the 2nd byte value for a given column.
   Columns 0..0x19 map to 0x41..0x5a, columns 0x1a..0x33 to 0x61..0x7a,
   and columns from 0x34 on to 0x81 and up.  */
static int col_byte_uhc_2 (int col) {
  return (col >= 0x34 ? 0x4d : col >= 0x1a ? 0x47 : 0x41) + col;
}
/* UHC part 2: returns the row for a given 1st byte value
   (0xa1..0xfe -> 0..93), or -1 if the byte is outside that range.  */
static int byte_row_uhc_2 (int byte) {
  if (byte >= 0xa1 && byte < 0xff)
    return byte-0xa1;
  else
    return -1;
}
/* UHC part 2: returns the column for a given 2nd byte value, or -1 if
   the byte is in none of the ranges 0x41..0x5a, 0x61..0x7a, 0x81..0xa0.
   The three ranges are packed into consecutive columns.  */
static int byte_col_uhc_2 (int byte) {
  if (byte >= 0x41 && byte < 0x5b)
    return byte-0x41;
  else if (byte >= 0x61 && byte < 0x7b)
    return byte-0x47;
  else if (byte >= 0x81 && byte < 0xa1)
    return byte-0x4d;
  else
    return -1;
}
1107 static void do_uhc_2 (const char* name)
1109 Encoding enc;
1111 enc.rows = 94;
1112 enc.cols = 84;
1113 enc.row_byte = row_byte_uhc_2;
1114 enc.col_byte = col_byte_uhc_2;
1115 enc.byte_row = byte_row_uhc_2;
1116 enc.byte_col = byte_col_uhc_2;
1117 enc.check_row_expr = "(%1$s >= 0xa1 && %1$s < 0xff)";
1118 enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xa1)";
1119 enc.byte_row_expr = "%1$s - 0xa1";
1120 enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1122 read_table(&enc);
1123 output_charset2uni_noholes_monotonic(name,&enc);
1124 invert(&enc); output_uni2charset_sparse(name,&enc,true);
/* Big5 specifics */
/* Big5: returns the 1st byte value for a given row
   (row 0 is byte 0xa1).  */
static int row_byte_big5 (int row) {
  return 0xa1+row;
}
/* Big5: returns the 2nd byte value for a given column.
   Columns 0..0x3e map to 0x40..0x7e; columns from 0x3f on map to 0xa1
   and up.  */
static int col_byte_big5 (int col) {
  return (col >= 0x3f ? 0x62 : 0x40) + col;
}
/* Big5: returns the row for a given 1st byte value
   (0xa1..0xfe -> 0..93), or -1 if the byte is outside that range.  */
static int byte_row_big5 (int byte) {
  if (byte >= 0xa1 && byte < 0xff)
    return byte-0xa1;
  else
    return -1;
}
/* Big5: returns the column for a given 2nd byte value, or -1 if the
   byte is in neither 0x40..0x7e nor 0xa1..0xfe.  The two ranges are
   packed into consecutive columns.  */
static int byte_col_big5 (int byte) {
  if (byte >= 0x40 && byte < 0x7f)
    return byte-0x40;
  else if (byte >= 0xa1 && byte < 0xff)
    return byte-0x62;
  else
    return -1;
}
1150 static void do_big5 (const char* name)
1152 Encoding enc;
1154 enc.rows = 94;
1155 enc.cols = 157;
1156 enc.row_byte = row_byte_big5;
1157 enc.col_byte = col_byte_big5;
1158 enc.byte_row = byte_row_big5;
1159 enc.byte_col = byte_col_big5;
1160 enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
1161 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1162 enc.byte_row_expr = "%1$s - 0xa1";
1163 enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1165 read_table(&enc);
1166 output_charset2uni(name,&enc);
1167 invert(&enc); output_uni2charset_sparse(name,&enc,false);
/* HKSCS specifics */
/* HKSCS: returns the 1st byte value for a given row
   (row 0 is byte 0x80).  */
static int row_byte_hkscs (int row) {
  return 0x80+row;
}
/* HKSCS: returns the row for a given 1st byte value
   (0x80..0xfe -> 0..126), or -1 if the byte is outside that range.  */
static int byte_row_hkscs (int byte) {
  if (byte >= 0x80 && byte < 0xff)
    return byte-0x80;
  else
    return -1;
}
1182 static void do_hkscs (const char* name)
1184 Encoding enc;
1186 enc.rows = 128;
1187 enc.cols = 157;
1188 enc.row_byte = row_byte_hkscs;
1189 enc.col_byte = col_byte_big5;
1190 enc.byte_row = byte_row_hkscs;
1191 enc.byte_col = byte_col_big5;
1192 enc.check_row_expr = "%1$s >= 0x80 && %1$s < 0xff";
1193 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1194 enc.byte_row_expr = "%1$s - 0x80";
1195 enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1197 read_table(&enc);
1198 output_charset2uni(name,&enc);
1199 invert(&enc); output_uni2charset_sparse(name,&enc,false);
/* Johab Hangul specifics */
/* Johab Hangul: returns the 1st byte value for a given row
   (row 0 is byte 0x84).  */
static int row_byte_johab_hangul (int row) {
  return 0x84+row;
}
/* Johab Hangul: returns the 2nd byte value for a given column.
   Columns 0..0x3d map to 0x41..0x7e; columns from 0x3e on map to 0x81
   and up.  */
static int col_byte_johab_hangul (int col) {
  return (col >= 0x3e ? 0x43 : 0x41) + col;
}
/* Johab Hangul: returns the row for a given 1st byte value
   (0x84..0xd3 -> 0..79), or -1 if the byte is outside that range.  */
static int byte_row_johab_hangul (int byte) {
  if (byte >= 0x84 && byte < 0xd4)
    return byte-0x84;
  else
    return -1;
}
/* Johab Hangul: returns the column for a given 2nd byte value, or -1
   if the byte is in neither 0x41..0x7e nor 0x81..0xfe.  The two ranges
   are packed into consecutive columns.  */
static int byte_col_johab_hangul (int byte) {
  if (byte >= 0x41 && byte < 0x7f)
    return byte-0x41;
  else if (byte >= 0x81 && byte < 0xff)
    return byte-0x43;
  else
    return -1;
}
1225 static void do_johab_hangul (const char* name)
1227 Encoding enc;
1229 enc.rows = 80;
1230 enc.cols = 188;
1231 enc.row_byte = row_byte_johab_hangul;
1232 enc.col_byte = col_byte_johab_hangul;
1233 enc.byte_row = byte_row_johab_hangul;
1234 enc.byte_col = byte_col_johab_hangul;
1235 enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
1236 enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
1237 enc.byte_row_expr = "%1$s - 0x84";
1238 enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
1240 read_table(&enc);
1241 output_charset2uni(name,&enc);
1242 invert(&enc); output_uni2charset_dense(name,&enc);
/* SJIS specifics */
/* SJIS: returns the 1st byte value for a given row.
   Rows 0..0x1e map to 0x81..0x9f; rows from 0x1f on map to 0xe0 and
   up.  */
static int row_byte_sjis (int row) {
  return (row >= 0x1f ? 0xc1 : 0x81) + row;
}
/* SJIS: returns the 2nd byte value for a given column.
   Columns 0..0x3e map to 0x40..0x7e; columns from 0x3f on map to 0x80
   and up (0x7f is skipped).  */
static int col_byte_sjis (int col) {
  return (col >= 0x3f ? 0x41 : 0x40) + col;
}
/* SJIS: returns the row for a given 1st byte value, or -1 if the byte
   is neither in 0x81..0x9f nor >= 0xe0.  The second range has no upper
   bound here; it continues the row numbering right after the first.  */
static int byte_row_sjis (int byte) {
  if (byte >= 0x81 && byte < 0xa0)
    return byte-0x81;
  else if (byte >= 0xe0)
    return byte-0xc1;
  else
    return -1;
}
/* SJIS: returns the column for a given 2nd byte value, or -1 if the
   byte is in neither 0x40..0x7e nor 0x80..0xfc.  The two ranges are
   packed into consecutive columns.  */
static int byte_col_sjis (int byte) {
  if (byte >= 0x40 && byte < 0x7f)
    return byte-0x40;
  else if (byte >= 0x80 && byte < 0xfd)
    return byte-0x41;
  else
    return -1;
}
1270 static void do_sjis (const char* name)
1272 Encoding enc;
1274 enc.rows = 94;
1275 enc.cols = 188;
1276 enc.row_byte = row_byte_sjis;
1277 enc.col_byte = col_byte_sjis;
1278 enc.byte_row = byte_row_sjis;
1279 enc.byte_col = byte_col_sjis;
1280 enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
1281 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
1282 enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
1283 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1285 read_table(&enc);
1286 output_charset2uni(name,&enc);
1287 invert(&enc); output_uni2charset_sparse(name,&enc,false);
/* GB18030 Unicode specifics */
/* Generates the tables and the conversion functions for the four-byte
   GB18030 <-> Unicode mapping.

   Input (stdin): a unicode.org style .TXT file.  Each entry is
   "0x<4 bytes> 0x<unicode>"; whitespace is skipped and '#' starts a
   comment that runs to the end of the line.  The four bytes must lie
   in 0x81..0x84, 0x30..0x39, 0x81..0xfe, 0x30..0x39 and the Unicode
   value must be below 0x10000.  A Unicode value of 0 is treated like
   "no mapping".

   Output (stdout): <name>_charset2uni_ranges, <name>_uni2charset_ranges,
   <name>_ranges, <name>_bitmap, and the functions <name>_mbtowc and
   <name>_wctomb that search those tables.

   Exits with status 1 on malformed input, when the mapping is not of
   the expected piecewise-linear, monotonically increasing form, when
   more than 256 ranges would be needed, or when no mapping was read
   at all (the generated tables would be empty and the range bounds
   undefined).  */
static void do_gb18030uni (const char* name)
{
  int c;
  unsigned int bytes;
  unsigned int uni;
  int i1, i2, i3, i4, i, j, k;
  int charset2uni[4*10*126*10];
  struct { int low; int high; int diff; int total; } ranges[256];
  int ranges_count, ranges_total;

  for (i = 0; i < 4*10*126*10; i++)
    charset2uni[i] = 0;

  /* Read a unicode.org style .TXT file. */
  for (;;) {
    c = getc(stdin);
    if (c == EOF)
      break;
    if (c == '\n' || c == ' ' || c == '\t')
      continue;
    if (c == '#') {
      do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
      continue;
    }
    ungetc(c,stdin);
    if (scanf("0x%x", &bytes) != 1)
      exit(1);
    i1 = (bytes >> 24) & 0xff;
    i2 = (bytes >> 16) & 0xff;
    i3 = (bytes >> 8) & 0xff;
    i4 = bytes & 0xff;
    if (!(i1 >= 0x81 && i1 <= 0x84
          && i2 >= 0x30 && i2 <= 0x39
          && i3 >= 0x81 && i3 <= 0xfe
          && i4 >= 0x30 && i4 <= 0x39)) {
      fprintf(stderr, "lost entry for %02x %02x %02x %02x\n", i1, i2, i3, i4);
      exit(1);
    }
    /* Linear index of the four-byte sequence. */
    i = (((i1-0x81) * 10 + (i2-0x30)) * 126 + (i3-0x81)) * 10 + (i4-0x30);
    /* %x stores through an unsigned int pointer, so read into one. */
    if (scanf(" 0x%x", &uni) != 1)
      exit(1);
    if (!(uni < 0x10000))
      exit(1);
    j = (int) uni;
    charset2uni[i] = j;
  }

  /* Verify that the mapping i -> j is monotonically increasing and
     of the form
        low[k] <= i <= high[k]  =>  j = diff[k] + i
     with a set of disjoint intervals (low[k], high[k]). */
  ranges_count = 0;
  for (i = 0; i < 4*10*126*10; i++)
    if (charset2uni[i] != 0) {
      int diff;
      j = charset2uni[i];
      diff = j - i;
      if (ranges_count > 0) {
        if (!(i > ranges[ranges_count-1].high))
          exit(1);
        if (!(j > ranges[ranges_count-1].high + ranges[ranges_count-1].diff))
          exit(1);
        /* Additional property: The diffs are also increasing. */
        if (!(diff >= ranges[ranges_count-1].diff))
          exit(1);
      }
      if (ranges_count > 0 && diff == ranges[ranges_count-1].diff)
        ranges[ranges_count-1].high = i;
      else {
        if (ranges_count == 256)
          exit(1);
        ranges[ranges_count].low = i;
        ranges[ranges_count].high = i;
        ranges[ranges_count].diff = diff;
        ranges_count++;
      }
    }

  /* Everything below indexes ranges[0] and ranges[ranges_count-1];
     without at least one range that would read outside the array. */
  if (ranges_count == 0) {
    fprintf(stderr, "no mappings found in input\n");
    exit(1);
  }

  /* Determine size of bitmap. */
  ranges_total = 0;
  for (k = 0; k < ranges_count; k++) {
    ranges[k].total = ranges_total;
    ranges_total += ranges[k].high - ranges[k].low + 1;
  }

  printf("static const unsigned short %s_charset2uni_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" 0x%04x, 0x%04x", ranges[k].low, ranges[k].high);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const unsigned short %s_uni2charset_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" 0x%04x, 0x%04x", ranges[k].low + ranges[k].diff, ranges[k].high + ranges[k].diff);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const struct { unsigned short diff; unsigned short bitmap_offset; } %s_ranges[%d] = {\n ", name, ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" { %5d, 0x%04x }", ranges[k].diff, ranges[k].total);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n ");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  /* One bit per position inside a range, ranges laid out back to back;
     a set bit means that position has a mapping. */
  printf("static const unsigned char %s_bitmap[%d] = {\n ", name, (ranges_total + 7) / 8);
  {
    int accu = 0;
    for (k = 0; k < ranges_count; k++) {
      for (i = ranges[k].total; i <= ranges[k].total + (ranges[k].high - ranges[k].low);) {
        if (charset2uni[i - ranges[k].total + ranges[k].low] != 0)
          accu |= (1 << (i % 8));
        i++;
        if ((i % 8) == 0) {
          printf(" 0x%02x", accu);
          if ((i / 8) < (ranges_total + 7) / 8) printf(",");
          if (((i / 8) % 12) == 0)
            printf("\n ");
          accu = 0;
        }
      }
      /* Consistency check: the bit position must line up with the
         start of the next range. */
      if (i != (k+1 < ranges_count ? ranges[k+1].total : ranges_total)) abort();
    }
    if ((ranges_total % 8) != 0)
      printf(" 0x%02x", accu);
    printf("\n");
  }
  printf("};\n");

  printf("\n");

  printf("static int\n");
  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
  printf("{\n");
  printf(" unsigned char c1 = s[0];\n");
  printf(" if (c1 >= 0x81 && c1 <= 0x84) {\n");
  printf(" if (n >= 2) {\n");
  printf(" unsigned char c2 = s[1];\n");
  printf(" if (c2 >= 0x30 && c2 <= 0x39) {\n");
  printf(" if (n >= 3) {\n");
  printf(" unsigned char c3 = s[2];\n");
  printf(" if (c3 >= 0x81 && c3 <= 0xfe) {\n");
  printf(" if (n >= 4) {\n");
  printf(" unsigned char c4 = s[3];\n");
  printf(" if (c4 >= 0x30 && c4 <= 0x39) {\n");
  printf(" unsigned int i = (((c1 - 0x81) * 10 + (c2 - 0x30)) * 126 + (c3 - 0x81)) * 10 + (c4 - 0x30);\n");
  printf(" if (i >= %d && i <= %d) {\n", ranges[0].low, ranges[ranges_count-1].high);
  printf(" unsigned int k1 = 0;\n");
  printf(" unsigned int k2 = %d;\n", ranges_count-1);
  printf(" while (k1 < k2) {\n");
  printf(" unsigned int k = (k1 + k2) / 2;\n");
  printf(" if (i <= %s_charset2uni_ranges[2*k+1])\n", name);
  printf(" k2 = k;\n");
  printf(" else if (i >= %s_charset2uni_ranges[2*k+2])\n", name);
  printf(" k1 = k + 1;\n");
  printf(" else\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" {\n");
  printf(" unsigned int bitmap_index = i - %s_charset2uni_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf(" unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf(" *pwc = (ucs4_t) (i + diff);\n");
  printf(" return 4;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf("}\n");

  printf("\n");

  printf("static int\n");
  printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
  printf("{\n");
  printf(" if (n >= 4) {\n");
  printf(" unsigned int i = wc;\n");
  printf(" if (i >= 0x%04x && i <= 0x%04x) {\n", ranges[0].low + ranges[0].diff, ranges[ranges_count-1].high + ranges[ranges_count-1].diff);
  printf(" unsigned int k1 = 0;\n");
  printf(" unsigned int k2 = %d;\n", ranges_count-1);
  printf(" while (k1 < k2) {\n");
  printf(" unsigned int k = (k1 + k2) / 2;\n");
  printf(" if (i <= %s_uni2charset_ranges[2*k+1])\n", name);
  printf(" k2 = k;\n");
  printf(" else if (i >= %s_uni2charset_ranges[2*k+2])\n", name);
  printf(" k1 = k + 1;\n");
  printf(" else\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" {\n");
  printf(" unsigned int bitmap_index = i - %s_uni2charset_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf(" unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf(" i -= diff;\n");
  printf(" r[3] = (i %% 10) + 0x30; i = i / 10;\n");
  printf(" r[2] = (i %% 126) + 0x81; i = i / 126;\n");
  printf(" r[1] = (i %% 10) + 0x30; i = i / 10;\n");
  printf(" r[0] = i + 0x81;\n");
  printf(" return 4;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOSMALL;\n");
  printf("}\n");
}
/* Main program */
/* Entry point.
   Usage: cjk_tab_to_h CHARSETNAME NAME < table > header
   argv[1] (CHARSETNAME) is passed to output_title(); argv[2] (NAME)
   selects which generator runs and is handed to it.
   Returns 0 on success.  Exits with status 1, after a message on
   stderr, when the argument count is wrong or NAME is not one of the
   known table names.  Note that output_title() has already run by the
   time an unknown NAME is detected.  */
int main (int argc, char *argv[])
{
  const char* charsetname;
  const char* name;

  if (argc != 3) {
    fprintf(stderr, "Usage: %s charsetname name < table > header\n",
            argc > 0 ? argv[0] : "cjk_tab_to_h");
    exit(1);
  }
  charsetname = argv[1];
  name = argv[2];

  output_title(charsetname);

  if (!strcmp(name,"gb2312")
      || !strcmp(name,"isoir165ext") || !strcmp(name,"gb12345ext")
      || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
    do_normal(name);
  else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
           || !strcmp(name,"cns11643_3"))
    do_normal_only_charset2uni(name);
  else if (!strcmp(name,"cns11643_inv"))
    do_cns11643_only_uni2charset(name);
  else if (!strcmp(name,"gbkext1"))
    do_gbk1_only_charset2uni(name);
  else if (!strcmp(name,"gbkext2"))
    do_gbk2_only_charset2uni(name);
  else if (!strcmp(name,"gbkext_inv"))
    do_gbk1_only_uni2charset(name);
  else if (!strcmp(name,"cp936ext") || !strcmp(name,"gb18030ext"))
    do_gbk1(name);
  else if (!strcmp(name,"ksc5601"))
    do_ksc5601(name);
  else if (!strcmp(name,"uhc_1"))
    do_uhc_1(name);
  else if (!strcmp(name,"uhc_2"))
    do_uhc_2(name);
  else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
    do_big5(name);
  else if (!strcmp(name,"hkscs"))
    do_hkscs(name);
  else if (!strcmp(name,"johab_hangul"))
    do_johab_hangul(name);
  else if (!strcmp(name,"cp932ext"))
    do_sjis(name);
  else if (!strcmp(name,"gb18030uni"))
    do_gb18030uni(name);
  else {
    fprintf(stderr, "unknown table name: %s\n", name);
    exit(1);
  }

  return 0;
}