Support for "iconv -c".
[libiconv.git] / tools / cjk_tab_to_h.c
blob357526a2bc58876f78b80f9104c85b7b64d2625f
1 /* Copyright (C) 1999-2001 Free Software Foundation, Inc.
2 This file is part of the GNU LIBICONV Tools.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 * Generates a CJK character set table from a .TXT table as found on
20 * ftp.unicode.org or in the X nls directory.
21 * Examples:
23 * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
24 * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
25 * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
27 * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
28 * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
29 * ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
30 * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
31 * ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
33 * ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
35 * ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <stdbool.h>
41 #include <string.h>
/* Half-open range [start, end) of linear indices into a table. */
typedef struct {
  int start;    /* first index belonging to the block */
  int end;      /* one past the last index belonging to the block */
} Block;
48 typedef struct {
49 int rows; /* number of possible values for the 1st byte */
50 int cols; /* number of possible values for the 2nd byte */
51 int (*row_byte) (int row); /* returns the 1st byte value for a given row */
52 int (*col_byte) (int col); /* returns the 2nd byte value for a given col */
53 int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */
54 int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */
55 const char* check_row_expr; /* format string for 1st byte value checking */
56 const char* check_col_expr; /* format string for 2nd byte value checking */
57 const char* byte_row_expr; /* format string for 1st byte value to row */
58 const char* byte_col_expr; /* format string for 2nd byte value to col */
59 int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */
60 /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
61 Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
62 int* charsetpage; /* charsetpage[0..rows]: how large is a page for a row */
63 int ncharsetblocks;
64 Block* charsetblocks; /* blocks[0..nblocks-1] */
65 int* uni2charset; /* uni2charset[0x0000..0xffff] */
66 int fffd; /* uni representation of the invalid character */
67 } Encoding;
/*
 * Outputs the file title: the license notice of the generated header,
 * followed by a comment block naming the character set.
 *   charsetname - text printed inside the second comment block.
 */
static void output_title (const char *charsetname)
{
  printf("/*\n");
  printf(" * Copyright (C) 1999-2001 Free Software Foundation, Inc.\n");
  printf(" * This file is part of the GNU LIBICONV Library.\n");
  printf(" *\n");
  printf(" * The GNU LIBICONV Library is free software; you can redistribute it\n");
  printf(" * and/or modify it under the terms of the GNU Library General Public\n");
  printf(" * License as published by the Free Software Foundation; either version 2\n");
  printf(" * of the License, or (at your option) any later version.\n");
  printf(" *\n");
  printf(" * The GNU LIBICONV Library is distributed in the hope that it will be\n");
  printf(" * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  printf(" * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n");
  printf(" * Library General Public License for more details.\n");
  printf(" *\n");
  printf(" * You should have received a copy of the GNU Library General Public\n");
  printf(" * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n");
  printf(" * If not, write to the Free Software Foundation, Inc., 59 Temple Place -\n");
  printf(" * Suite 330, Boston, MA 02111-1307, USA.\n");
  printf(" */\n");
  printf("\n");
  printf("/*\n");
  printf(" * %s\n", charsetname);
  printf(" */\n");
  printf("\n");
}
101 * Reads the charset2uni table from standard input.
103 static void read_table (Encoding* enc)
105 int row, col, i, i1, i2, c, j;
107 enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
108 for (row = 0; row < enc->rows; row++)
109 enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
111 for (row = 0; row < enc->rows; row++)
112 for (col = 0; col < enc->cols; col++)
113 enc->charset2uni[row][col] = 0xfffd;
115 c = getc(stdin);
116 ungetc(c,stdin);
117 if (c == '#') {
118 /* Read a unicode.org style .TXT file. */
119 for (;;) {
120 c = getc(stdin);
121 if (c == EOF)
122 break;
123 if (c == '\n' || c == ' ' || c == '\t')
124 continue;
125 if (c == '#') {
126 do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
127 continue;
129 ungetc(c,stdin);
130 if (scanf("0x%x", &j) != 1)
131 exit(1);
132 i1 = j >> 8;
133 i2 = j & 0xff;
134 row = enc->byte_row(i1);
135 col = enc->byte_col(i2);
136 if (row < 0 || col < 0) {
137 fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
138 exit(1);
140 if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
141 exit(1);
143 } else {
144 /* Read a table of hexadecimal Unicode values. */
145 for (i1 = 32; i1 < 132; i1++)
146 for (i2 = 32; i2 < 132; i2++) {
147 i = scanf("%x", &j);
148 if (i == EOF)
149 goto read_done;
150 if (i != 1)
151 exit(1);
152 if (j < 0 || j == 0xffff)
153 j = 0xfffd;
154 if (j != 0xfffd) {
155 if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
156 fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
157 exit (1);
159 enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
162 read_done: ;
167 * Determine whether the Unicode range goes outside the BMP.
169 static bool is_charset2uni_large (Encoding* enc)
171 int row, col;
173 for (row = 0; row < enc->rows; row++)
174 for (col = 0; col < enc->cols; col++)
175 if (enc->charset2uni[row][col] >= 0x10000)
176 return true;
177 return false;
181 * Compactify the Unicode range by use of an auxiliary table,
182 * so 16 bits suffice to store each value.
184 static int compact_large_charset2uni (Encoding* enc, unsigned int **urows)
186 int upages[0x1100];
187 int i, row, col, nurows;
189 for (i = 0; i < 0x1100; i++)
190 upages[i] = -1;
192 for (row = 0; row < enc->rows; row++)
193 for (col = 0; col < enc->cols; col++)
194 upages[enc->charset2uni[row][col] >> 8] = 0;
196 nurows = 0;
197 for (i = 0; i < 0x1100; i++)
198 if (upages[i] == 0)
199 nurows++;
201 *urows = (unsigned int *) malloc(nurows * sizeof(unsigned int));
203 nurows = 0;
204 for (i = 0; i < 0x1100; i++)
205 if (upages[i] == 0) {
206 upages[i] = nurows;
207 (*urows)[nurows] = i;
208 nurows++;
211 for (row = 0; row < enc->rows; row++)
212 for (col = 0; col < enc->cols; col++) {
213 int u = enc->charset2uni[row][col];
214 enc->charset2uni[row][col] = (upages[u >> 8] << 8) | (u & 0xFF);
216 enc->fffd = (upages[0xfffd >> 8] << 8) | (0xfffd & 0xFF);
218 return nurows;
222 * Computes the charsetpage[0..rows] array.
224 static void find_charset2uni_pages (Encoding* enc)
226 int row, col;
228 enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));
230 for (row = 0; row <= enc->rows; row++)
231 enc->charsetpage[row] = 0;
233 for (row = 0; row < enc->rows; row++) {
234 int used = 0;
235 for (col = 0; col < enc->cols; col++)
236 if (enc->charset2uni[row][col] != enc->fffd)
237 used = col+1;
238 enc->charsetpage[row] = used;
243 * Fills in nblocks and blocks.
245 static void find_charset2uni_blocks (Encoding* enc)
247 int n, row, lastrow;
249 enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));
251 n = 0;
252 for (row = 0; row < enc->rows; row++)
253 if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
254 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
255 enc->charsetblocks[n].start = row * enc->cols;
256 enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
257 n++;
259 enc->ncharsetblocks = n;
263 * Outputs the charset to unicode table and function.
/*
 * output_charset2uni: prints to standard output the C source for the
 * charset -> Unicode direction: one <name>_2uni_pageXX array per block of
 * consecutive used rows, a <name>_2uni_upages array when some value is
 * >= 0x10000, and the <name>_mbtowc function that indexes these arrays.
 *   name - prefix of every emitted identifier.
 *   enc  - encoding whose charset2uni is already filled. charsetpage,
 *          charsetblocks, ncharsetblocks and fffd are computed here, and
 *          charset2uni is rewritten in place when the large path is taken.
 */
265 static void output_charset2uni (const char* name, Encoding* enc)
267 int nurows, row, col, lastrow, col_max, i, i1_min, i1_max;
268 bool is_large;
269 unsigned int* urows;
/* Values >= 0x10000 do not fit the emitted unsigned short arrays; in that
   case they are remapped through an auxiliary page table first. */
271 is_large = is_charset2uni_large(enc);
272 if (is_large) {
273 nurows = compact_large_charset2uni(enc,&urows);
274 } else {
275 nurows = 0; urows = NULL; enc->fffd = 0xfffd;
278 find_charset2uni_pages(enc);
280 find_charset2uni_blocks(enc);
/* Emit the data arrays: one array per run of consecutive non-empty rows. */
282 for (row = 0; row < enc->rows; row++)
283 if (enc->charsetpage[row] > 0) {
284 if (row == 0 || enc->charsetpage[row-1] == 0) {
285 /* Start a new block. */
286 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
287 printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
288 name, enc->row_byte(row),
289 (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
291 printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
/* A row that is followed by another used row is emitted in full; the last
   row of a block is emitted only up to its last used column. */
292 col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
293 for (col = 0; col < col_max; col++) {
294 printf(" 0x%04x,", enc->charset2uni[row][col]);
295 if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
297 printf("\n");
298 if (enc->charsetpage[row+1] == 0) {
299 /* End a block. */
300 printf("};\n");
303 printf("\n");
/* Auxiliary table: the first code point (page number << 8) of every Unicode
   page referenced by the compacted values. */
305 if (is_large) {
306 printf("static const ucs4_t %s_2uni_upages[%d] = {\n ", name, nurows);
307 for (i = 0; i < nurows; i++) {
308 printf(" 0x%05x,", urows[i] << 8);
309 if ((i % 8) == 7 && (i+1 < nurows)) printf("\n ");
311 printf("\n");
312 printf("};\n");
313 printf("\n");
/* Emit the decoder function. */
316 printf("static int\n");
317 printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
318 printf("{\n");
319 printf(" unsigned char c1 = s[0];\n");
320 printf(" if (");
/* First-byte test: one range (or a single equality) per block. */
321 for (i = 0; i < enc->ncharsetblocks; i++) {
322 i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
323 i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
324 if (i > 0)
325 printf(" || ");
326 if (i1_min == i1_max)
327 printf("(c1 == 0x%02x)", i1_min);
328 else
329 printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
331 printf(") {\n");
332 printf(" if (n >= 2) {\n");
333 printf(" unsigned char c2 = s[1];\n");
334 printf(" if (");
/* The *_expr members are printf formats; the variable name is substituted
   into them. */
335 printf(enc->check_col_expr, "c2");
336 printf(") {\n");
337 printf(" unsigned int i = %d * (", enc->cols);
338 printf(enc->byte_row_expr, "c1");
339 printf(") + (");
340 printf(enc->byte_col_expr, "c2");
341 printf(");\n");
342 printf(" %s wc = 0xfffd;\n", is_large ? "ucs4_t" : "unsigned short");
343 if (is_large) printf(" unsigned short swc;\n");
/* One if/else branch per block; each branch compares i with the block end
   before indexing the block's array (shifted by the block start). */
344 for (i = 0; i < enc->ncharsetblocks; i++) {
345 printf(" ");
346 if (i > 0)
347 printf("} else ");
348 if (i < enc->ncharsetblocks-1)
349 printf("if (i < %d) ", enc->charsetblocks[i+1].start);
350 printf("{\n");
351 printf(" if (i < %d)\n", enc->charsetblocks[i].end);
352 printf(" %s = ", is_large ? "swc" : "wc");
353 printf("%s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
354 if (enc->charsetblocks[i].start > 0)
355 printf("-%d", enc->charsetblocks[i].start);
356 printf("]");
357 if (is_large) printf(",\n wc = %s_2uni_upages[swc>>8] | (swc & 0xff)", name);
358 printf(";\n");
360 printf(" }\n");
361 printf(" if (wc != 0xfffd) {\n");
362 printf(" *pwc = %swc;\n", is_large ? "" : "(ucs4_t) ");
363 printf(" return 2;\n");
364 printf(" }\n");
365 printf(" }\n");
366 printf(" return RET_ILSEQ;\n");
367 printf(" }\n");
368 printf(" return RET_TOOFEW(0);\n");
369 printf(" }\n");
370 printf(" return RET_ILSEQ;\n");
371 printf("}\n");
372 printf("\n");
376 * Outputs the charset to unicode table and function.
377 * (Suitable if the mapping function is well defined, i.e. has no holes, and
378 * is monotonically increasing with small gaps only.)
/*
 * output_charset2uni_noholes_monotonic: prints to standard output a compact
 * form of the charset -> Unicode tables and the <name>_mbtowc function.
 * Per block it emits a <name>_2uni_main_pageXX array holding steps_per_row
 * base values for each row, and a <name>_2uni_pageXX array of unsigned char
 * holding each cell's difference from the base value of its step.
 *   name - prefix of every emitted identifier.
 *   enc  - encoding whose charset2uni is already filled; charsetpage,
 *          charsetblocks and ncharsetblocks are computed here.
 * NOTE(review): enc->fffd is read by find_charset2uni_pages but is not set in
 * this function - confirm that the caller initialises it.
 */
380 static void output_charset2uni_noholes_monotonic (const char* name, Encoding* enc)
382 int row, col, lastrow, r, col_max, i, i1_min, i1_max;
384 /* Choose stepsize so that stepsize*steps_per_row >= enc->cols, and
385 enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]
386 is always < 0x100. */
387 int steps_per_row = 2;
388 int stepsize = (enc->cols + steps_per_row-1) / steps_per_row;
390 find_charset2uni_pages(enc);
392 find_charset2uni_blocks(enc);
/* Emit the data arrays of every block. */
394 for (row = 0; row < enc->rows; row++)
395 if (enc->charsetpage[row] > 0) {
396 if (row == 0 || enc->charsetpage[row-1] == 0) {
397 /* Start a new block. */
398 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
/* Base values: the cell at column i*stepsize of each row of the block. */
399 printf("static const unsigned short %s_2uni_main_page%02x[%d] = {\n ",
400 name, enc->row_byte(row),
401 steps_per_row*(lastrow-row+1));
402 for (r = row; r <= lastrow; r++) {
403 for (i = 0; i < steps_per_row; i++)
404 printf(" 0x%04x,", enc->charset2uni[r][i*stepsize]);
405 if (((r-row) % 4) == 3 && (r < lastrow)) printf("\n ");
407 printf("\n");
408 printf("};\n");
409 printf("static const unsigned char %s_2uni_page%02x[%d] = {\n",
410 name, enc->row_byte(row),
411 (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
413 printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
414 col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
/* Each cell is stored as its difference from the base value of its step. */
415 for (col = 0; col < col_max; col++) {
416 printf(" 0x%02x,", enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]);
417 if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
419 printf("\n");
420 if (enc->charsetpage[row+1] == 0) {
421 /* End a block. */
422 printf("};\n");
425 printf("\n");
/* Emit the decoder function. */
427 printf("static int\n");
428 printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
429 printf("{\n");
430 printf(" unsigned char c1 = s[0];\n");
431 printf(" if (");
/* First-byte test: one range (or a single equality) per block. */
432 for (i = 0; i < enc->ncharsetblocks; i++) {
433 i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
434 i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
435 if (i > 0)
436 printf(" || ");
437 if (i1_min == i1_max)
438 printf("(c1 == 0x%02x)", i1_min);
439 else
440 printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
442 printf(") {\n");
443 printf(" if (n >= 2) {\n");
444 printf(" unsigned char c2 = s[1];\n");
445 printf(" if (");
446 printf(enc->check_col_expr, "c2");
447 printf(") {\n");
448 printf(" unsigned int row = ");
449 printf(enc->byte_row_expr, "c1");
450 printf(";\n");
451 printf(" unsigned int col = ");
452 printf(enc->byte_col_expr, "c2");
453 printf(";\n");
454 printf(" unsigned int i = %d * row + col;\n", enc->cols);
455 printf(" unsigned short wc = 0xfffd;\n");
/* One branch per block: the emitted expression adds the base value selected
   by (row, step) and the byte-sized difference stored at index i. */
456 for (i = 0; i < enc->ncharsetblocks; i++) {
457 printf(" ");
458 if (i > 0)
459 printf("} else ");
460 if (i < enc->ncharsetblocks-1)
461 printf("if (i < %d) ", enc->charsetblocks[i+1].start);
462 printf("{\n");
463 printf(" if (i < %d)\n", enc->charsetblocks[i].end);
464 printf(" wc = %s_2uni_main_page%02x[%d*", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols), steps_per_row);
465 if (enc->charsetblocks[i].start > 0)
466 printf("(row-%d)", enc->charsetblocks[i].start / enc->cols);
467 else
468 printf("row");
469 printf("+");
470 if (steps_per_row == 2)
471 printf("(col>=%d?1:0)", stepsize);
472 else
473 printf("col/%d", stepsize);
474 printf("] + %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
475 if (enc->charsetblocks[i].start > 0)
476 printf("-%d", enc->charsetblocks[i].start);
477 printf("];\n");
479 printf(" }\n");
480 printf(" if (wc != 0xfffd) {\n");
481 printf(" *pwc = (ucs4_t) wc;\n");
482 printf(" return 2;\n");
483 printf(" }\n");
484 printf(" }\n");
485 printf(" return RET_ILSEQ;\n");
486 printf(" }\n");
487 printf(" return RET_TOOFEW(0);\n");
488 printf(" }\n");
489 printf(" return RET_ILSEQ;\n");
490 printf("}\n");
491 printf("\n");
495 * Computes the uni2charset[0x0000..0x2ffff] array.
497 static void invert (Encoding* enc)
499 int row, col, j;
501 enc->uni2charset = (int*) malloc(0x30000*sizeof(int));
503 for (j = 0; j < 0x30000; j++)
504 enc->uni2charset[j] = 0;
506 for (row = 0; row < enc->rows; row++)
507 for (col = 0; col < enc->cols; col++) {
508 j = enc->charset2uni[row][col];
509 if (j != 0xfffd)
510 enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
515 * Outputs the unicode to charset table and function, using a linear array.
516 * (Suitable if the table is dense.)
/*
 * output_uni2charset_dense: prints to standard output the C source for the
 * Unicode -> charset direction as plain arrays: <name>_pageXX arrays of
 * two-byte codes and the <name>_wctomb function that selects an array by
 * range tests on wc.
 *   name - prefix of every emitted identifier.
 *   enc  - encoding with charset2uni and uni2charset already filled.
 * Calls abort() when one of its internal consistency checks fails.
 */
518 static void output_uni2charset_dense (const char* name, Encoding* enc)
520 /* Like in 8bit_tab_to_h.c */
521 bool pages[0x300];
522 int line[0x6000];
523 int tableno;
524 struct { int minline; int maxline; int usecount; } tables[0x6000];
525 bool first;
526 int row, col, j, p, j1, j2, t;
/* pages[] is filled here but not read again within this function. */
528 for (p = 0; p < 0x300; p++)
529 pages[p] = false;
530 for (row = 0; row < enc->rows; row++)
531 for (col = 0; col < enc->cols; col++) {
532 j = enc->charset2uni[row][col];
533 if (j != 0xfffd)
534 pages[j>>8] = true;
/* line[j1] stands for the 8 code points 8*j1 .. 8*j1+7: -1 when none of
   them is mapped, 0 (for now) otherwise. */
536 for (j1 = 0; j1 < 0x6000; j1++) {
537 bool all_invalid = true;
538 for (j2 = 0; j2 < 8; j2++) {
539 j = 8*j1+j2;
540 if (enc->uni2charset[j] != 0)
541 all_invalid = false;
543 if (all_invalid)
544 line[j1] = -1;
545 else
546 line[j1] = 0;
/* Group the used lines into tables. A line joins the previous table when it
   directly follows a line of that table, or when it lies in the same group
   of 32 lines (j1 >> 5) and at most 8 lines after the table's last line;
   otherwise it starts a new table. line[j1] becomes the table number. */
548 tableno = 0;
549 for (j1 = 0; j1 < 0x6000; j1++) {
550 if (line[j1] >= 0) {
551 if (tableno > 0
552 && ((j1 > 0 && line[j1-1] == tableno-1)
553 || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
554 && j1 - tables[tableno-1].maxline <= 8))) {
555 line[j1] = tableno-1;
556 tables[tableno-1].maxline = j1;
557 } else {
558 tableno++;
559 line[j1] = tableno-1;
560 tables[tableno-1].minline = tables[tableno-1].maxline = j1;
/* Count the mapped code points of every table. */
564 for (t = 0; t < tableno; t++) {
565 tables[t].usecount = 0;
566 j1 = 8*tables[t].minline;
567 j2 = 8*(tables[t].maxline+1);
568 for (j = j1; j < j2; j++)
569 if (enc->uni2charset[j] != 0)
570 tables[t].usecount++;
/* Emit an array for every table with more than one mapped code point. */
573 p = -1;
574 for (t = 0; t < tableno; t++)
575 if (tables[t].usecount > 1) {
576 p = tables[t].minline >> 5;
577 printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
578 for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
579 if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
580 printf(" /* 0x%04x */\n", 8*j1);
581 printf(" ");
582 for (j2 = 0; j2 < 8; j2++) {
583 j = 8*j1+j2;
584 printf(" 0x%04x,", enc->uni2charset[j]);
586 printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
588 printf("};\n");
590 if (p >= 0)
591 printf("\n");
/* Emit the encoder function: a chain of range tests, one per table. */
593 printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
594 printf("{\n");
595 printf(" if (n >= 2) {\n");
596 printf(" unsigned short c = 0;\n");
597 first = true;
598 for (j1 = 0; j1 < 0x6000;) {
599 t = line[j1];
/* j2 advances to the end of the run of lines carrying the same table number;
   for a real table it is then set to the line after the table's last line. */
600 for (j2 = j1; j2 < 0x6000 && line[j2] == t; j2++);
601 if (t >= 0) {
602 if (j1 != tables[t].minline) abort();
603 if (j2 > tables[t].maxline+1) abort();
604 j2 = tables[t].maxline+1;
605 if (first)
606 printf(" ");
607 else
608 printf(" else ");
609 first = false;
610 if (tables[t].usecount == 0) abort();
/* A table with a single mapped code point has no array; it becomes a direct
   comparison against that code point. */
611 if (tables[t].usecount == 1) {
612 if (j2 != j1+1) abort();
613 for (j = 8*j1; j < 8*j2; j++)
614 if (enc->uni2charset[j] != 0) {
615 printf("if (wc == 0x%04x)\n c = 0x%02x;\n", j, enc->uni2charset[j]);
616 break;
618 } else {
619 if (j1 == 0) {
620 printf("if (wc < 0x%04x)", 8*j2);
621 } else {
622 printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
624 printf("\n c = %s_page%02x[wc", name, j1 >> 5);
625 if (tables[t].minline > 0)
626 printf("-0x%04x", 8*j1);
627 printf("];\n");
630 j1 = j2;
632 printf(" if (c != 0) {\n");
633 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
634 printf(" return 2;\n");
635 printf(" }\n");
636 printf(" return RET_ILUNI;\n");
637 printf(" }\n");
638 printf(" return RET_TOOSMALL;\n");
639 printf("}\n");
643 * Outputs the unicode to charset table and function, using a packed array.
644 * (Suitable if the table is sparse.)
645 * The argument 'monotonic' may be set to true if the mapping is monotonically
646 * increasing with small gaps only.
/*
 * output_uni2charset_sparse: prints to standard output the C source for the
 * Unicode -> charset direction in packed form: a <name>_2charset array that
 * holds only the mapped codes, <name>_uni2indx_pageXX arrays of Summary16
 * entries (start index plus a 16-bit usage mask per 16 code points), and the
 * <name>_wctomb function that combines them.
 *   name      - prefix of every emitted identifier.
 *   enc       - encoding with charset2uni and uni2charset already filled.
 *   monotonic - when true, the codes are emitted as a <name>_2charset_main
 *               array of base values plus byte-sized differences.
 * NOTE(review): indx2charset alone is 0x30000 ints of automatic storage.
 */
648 static void output_uni2charset_sparse (const char* name, Encoding* enc, bool monotonic)
650 bool pages[0x300];
651 Block pageblocks[0x300]; int npageblocks;
652 int indx2charset[0x30000];
653 int summary_indx[0x3000];
654 int summary_used[0x3000];
655 int i, row, col, j, p, j1, j2, indx;
656 bool is_large;
657 /* for monotonic: */
/* The step is 64 entries when name is "uhc_2" and 128 entries otherwise. */
658 int log2_stepsize = (!strcmp(name,"uhc_2") ? 6 : 7);
659 int stepsize = 1 << log2_stepsize;
660 int indxsteps;
662 /* Fill pages[0x300]. */
663 for (p = 0; p < 0x300; p++)
664 pages[p] = false;
665 for (row = 0; row < enc->rows; row++)
666 for (col = 0; col < enc->cols; col++) {
667 j = enc->charset2uni[row][col];
668 if (j != 0xfffd)
669 pages[j>>8] = true;
672 /* Determine whether two or three bytes are needed for each character. */
673 is_large = false;
674 for (j = 0; j < 0x30000; j++)
675 if (enc->uni2charset[j] >= 0x10000)
676 is_large = true;
678 #if 0
679 for (p = 0; p < 0x300; p++)
680 if (pages[p]) {
681 printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
682 for (j1 = 0; j1 < 32; j1++) {
683 printf(" ");
684 for (j2 = 0; j2 < 8; j2++)
685 printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
686 printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
688 printf("};\n");
690 printf("\n");
691 #endif
693 /* Fill summary_indx[] and summary_used[]. */
/* Entry j1 stands for the 16 code points 16*j1 .. 16*j1+15. summary_indx[j1]
   is the number of mapped code points before the entry; bit j2 of
   summary_used[j1] is set when code point 16*j1+j2 is mapped. The mapped
   codes themselves are appended to indx2charset[] in code point order. */
694 indx = 0;
695 for (j1 = 0; j1 < 0x3000; j1++) {
696 summary_indx[j1] = indx;
697 summary_used[j1] = 0;
698 for (j2 = 0; j2 < 16; j2++) {
699 j = 16*j1+j2;
700 if (enc->uni2charset[j] != 0) {
701 indx2charset[indx++] = enc->uni2charset[j];
702 summary_used[j1] |= (1 << j2);
707 /* Fill npageblocks and pageblocks[]. */
/* A page block is a run of consecutive used pages. start and end are counted
   in summary entries (16 entries per page); trailing entries without any
   mapped code point are trimmed from the end. */
708 npageblocks = 0;
709 for (p = 0; p < 0x300; ) {
710 if (pages[p] && (p == 0 || !pages[p-1])) {
711 pageblocks[npageblocks].start = 16*p;
712 do p++; while (p < 0x300 && pages[p]);
713 j1 = 16*p;
714 while (summary_used[j1-1] == 0) j1--;
715 pageblocks[npageblocks].end = j1;
716 npageblocks++;
717 } else
718 p++;
/* Emit the code array: every stepsize-th value plus byte-sized differences
   (monotonic), three bytes per code (is_large), or one unsigned short per
   code otherwise. */
721 if (monotonic) {
722 indxsteps = (indx + stepsize-1) / stepsize;
723 printf("static const unsigned short %s_2charset_main[%d] = {\n", name, indxsteps);
724 for (i = 0; i < indxsteps; ) {
725 if ((i % 8) == 0) printf(" ");
726 printf(" 0x%04x,", indx2charset[i*stepsize]);
727 i++;
728 if ((i % 8) == 0 || i == indxsteps) printf("\n");
730 printf("};\n");
731 printf("static const unsigned char %s_2charset[%d] = {\n", name, indx);
732 for (i = 0; i < indx; ) {
733 if ((i % 8) == 0) printf(" ");
734 printf(" 0x%02x,", indx2charset[i] - indx2charset[i/stepsize*stepsize]);
735 i++;
736 if ((i % 8) == 0 || i == indx) printf("\n");
738 printf("};\n");
739 } else {
740 if (is_large) {
741 printf("static const unsigned char %s_2charset[3*%d] = {\n", name, indx);
742 for (i = 0; i < indx; ) {
743 if ((i % 4) == 0) printf(" ");
744 printf(" 0x%1x,0x%02x,0x%02x,", indx2charset[i] >> 16,
745 (indx2charset[i] >> 8) & 0xff, indx2charset[i] & 0xff);
746 i++;
747 if ((i % 4) == 0 || i == indx) printf("\n");
749 printf("};\n");
750 } else {
751 printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
752 for (i = 0; i < indx; ) {
753 if ((i % 8) == 0) printf(" ");
754 printf(" 0x%04x,", indx2charset[i]);
755 i++;
756 if ((i % 8) == 0 || i == indx) printf("\n");
758 printf("};\n");
761 printf("\n");
/* Emit one Summary16 array per page block. */
762 for (i = 0; i < npageblocks; i++) {
763 printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
764 pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
765 for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
766 if (((16*j1) % 0x100) == 0) printf(" /""* 0x%04x *""/\n", 16*j1);
767 if ((j1 % 4) == 0) printf(" ");
768 printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
769 j1++;
770 if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
772 printf("};\n");
774 printf("\n");
/* Emit the encoder function. It first selects the Summary16 entry for wc by
   range tests, one per page block. */
776 printf("static int\n");
777 printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
778 printf("{\n");
779 printf(" if (n >= 2) {\n");
780 printf(" const Summary16 *summary = NULL;\n");
781 for (i = 0; i < npageblocks; i++) {
782 printf(" ");
783 if (i > 0)
784 printf("else ");
785 printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
786 16*pageblocks[i].start, 16*pageblocks[i].end);
787 printf(" summary = &%s_uni2indx_page%02x[(wc>>4)", name,
788 pageblocks[i].start/16);
789 if (pageblocks[i].start > 0)
790 printf("-0x%03x", pageblocks[i].start);
791 printf("];\n");
/* The emitted code then masks `used' down to the bits below i, counts those
   bits with a parallel bit count, and adds summary->indx to obtain the index
   into the code array. */
793 printf(" if (summary) {\n");
794 printf(" unsigned short used = summary->used;\n");
795 printf(" unsigned int i = wc & 0x0f;\n");
796 printf(" if (used & ((unsigned short) 1 << i)) {\n");
797 if (monotonic || !is_large)
798 printf(" unsigned short c;\n");
799 printf(" /* Keep in `used' only the bits 0..i-1. */\n");
800 printf(" used &= ((unsigned short) 1 << i) - 1;\n");
801 printf(" /* Add `summary->indx' and the number of bits set in `used'. */\n");
802 printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
803 printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
804 printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
805 printf(" used = (used & 0x00ff) + (used >> 8);\n");
806 if (monotonic) {
807 printf(" used += summary->indx;\n");
808 printf(" c = %s_2charset_main[used>>%d] + %s_2charset[used];\n", name, log2_stepsize, name);
809 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
810 printf(" return 2;\n");
811 } else {
/* NOTE(review): the three-byte variant is emitted under the same "n >= 2"
   guard printed above - confirm that callers provide room for 3 bytes. */
812 if (is_large) {
813 printf(" used += summary->indx;\n");
814 printf(" r[0] = %s_2charset[3*used];\n", name);
815 printf(" r[1] = %s_2charset[3*used+1];\n", name);
816 printf(" r[2] = %s_2charset[3*used+2];\n", name);
817 printf(" return 3;\n");
818 } else {
819 printf(" c = %s_2charset[summary->indx + used];\n", name);
820 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
821 printf(" return 2;\n");
824 printf(" }\n");
825 printf(" }\n");
826 printf(" return RET_ILUNI;\n");
827 printf(" }\n");
828 printf(" return RET_TOOSMALL;\n");
829 printf("}\n");
/* ISO-2022/EUC specifics */

/* Returns the 1st byte value for a row: rows are numbered from byte 0x21. */
static int row_byte_normal (int row)
{
  return 0x21 + row;
}
/* Returns the 2nd byte value for a column: columns are numbered from byte 0x21. */
static int col_byte_normal (int col)
{
  return 0x21 + col;
}
/*
 * Converts a 1st byte value to a row index (0..93). Returns -1 for a byte
 * outside 0x21..0x7e, as the byte_row member of Encoding is documented to do;
 * without the upper bound, byte 0x7f would yield row 94 in a 94-row table.
 */
static int byte_row_normal (int byte)
{
  return (byte >= 0x21 && byte < 0x7f) ? byte - 0x21 : -1;
}
/*
 * Converts a 2nd byte value to a column index (0..93). Returns -1 for a byte
 * outside 0x21..0x7e, as the byte_col member of Encoding is documented to do;
 * without the upper bound, byte 0x7f would yield column 94 in a 94-column
 * table.
 */
static int byte_col_normal (int byte)
{
  return (byte >= 0x21 && byte < 0x7f) ? byte - 0x21 : -1;
}
839 static void do_normal (const char* name)
841 Encoding enc;
843 enc.rows = 94;
844 enc.cols = 94;
845 enc.row_byte = row_byte_normal;
846 enc.col_byte = col_byte_normal;
847 enc.byte_row = byte_row_normal;
848 enc.byte_col = byte_col_normal;
849 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
850 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
851 enc.byte_row_expr = "%1$s - 0x21";
852 enc.byte_col_expr = "%1$s - 0x21";
854 read_table(&enc);
855 output_charset2uni(name,&enc);
856 invert(&enc); output_uni2charset_sparse(name,&enc,false);
/* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
   starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
   order. There are 75 out-of-order values, scattered all throughout the table.
 */
864 static void do_normal_only_charset2uni (const char* name)
866 Encoding enc;
868 enc.rows = 94;
869 enc.cols = 94;
870 enc.row_byte = row_byte_normal;
871 enc.col_byte = col_byte_normal;
872 enc.byte_row = byte_row_normal;
873 enc.byte_col = byte_col_normal;
874 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
875 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
876 enc.byte_row_expr = "%1$s - 0x21";
877 enc.byte_col_expr = "%1$s - 0x21";
879 read_table(&enc);
880 output_charset2uni(name,&enc);
/* CNS 11643 specifics - trick to put two tables into one */

/*
 * Returns the "1st byte" value for a row of the combined table: row / 94 is
 * placed in the bits above the low byte, and row % 94 + 0x21 in the low byte.
 */
static int row_byte_cns11643 (int row)
{
  int table = row / 94;
  int row_in_table = row % 94;

  return 0x100 * table + row_in_table + 0x21;
}
/*
 * Inverse of row_byte_cns11643: the bits above the low byte select a group
 * of 94 rows, and the low byte minus 0x21 is the row within that group.
 * No range check is performed on the low byte.
 */
static int byte_row_cns11643 (int byte)
{
  int table = byte >> 8;
  int low = byte & 0xff;

  return table * 94 + low - 0x21;
}
892 static void do_cns11643_only_uni2charset (const char* name)
894 Encoding enc;
896 enc.rows = 16*94;
897 enc.cols = 94;
898 enc.row_byte = row_byte_cns11643;
899 enc.col_byte = col_byte_normal;
900 enc.byte_row = byte_row_cns11643;
901 enc.byte_col = byte_col_normal;
902 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
903 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
904 enc.byte_row_expr = "%1$s - 0x21";
905 enc.byte_col_expr = "%1$s - 0x21";
907 read_table(&enc);
908 invert(&enc);
909 output_uni2charset_sparse(name,&enc,false);
/* GBK specifics */

/* Returns the 1st byte value for a row: rows are numbered from byte 0x81. */
static int row_byte_gbk1 (int row)
{
  return 0x81 + row;
}
/*
 * Returns the 2nd byte value for a column. Columns 0..0x3e map to bytes
 * 0x40..0x7e; columns from 0x3f on map to bytes from 0x80 on, so byte 0x7f
 * is never produced.
 */
static int col_byte_gbk1 (int col)
{
  if (col >= 0x3f)
    return 0x41 + col;
  return 0x40 + col;
}
/* Converts a 1st byte value in 0x81..0xfe to a row index, else returns -1. */
static int byte_row_gbk1 (int byte)
{
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/*
 * Converts a 2nd byte value to a column index: bytes 0x40..0x7e give columns
 * 0..0x3e, bytes 0x80..0xfe give columns 0x3f..0xbd. Returns -1 for any
 * other byte, including 0x7f.
 */
static int byte_col_gbk1 (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0x80 && byte < 0xff)
    return byte - 0x41;
  return -1;
}
935 static void do_gbk1 (const char* name)
937 Encoding enc;
939 enc.rows = 126;
940 enc.cols = 190;
941 enc.row_byte = row_byte_gbk1;
942 enc.col_byte = col_byte_gbk1;
943 enc.byte_row = byte_row_gbk1;
944 enc.byte_col = byte_col_gbk1;
945 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
946 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
947 enc.byte_row_expr = "%1$s - 0x81";
948 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
950 read_table(&enc);
951 output_charset2uni(name,&enc);
952 invert(&enc); output_uni2charset_dense(name,&enc);
955 static void do_gbk1_only_charset2uni (const char* name)
957 Encoding enc;
959 enc.rows = 126;
960 enc.cols = 190;
961 enc.row_byte = row_byte_gbk1;
962 enc.col_byte = col_byte_gbk1;
963 enc.byte_row = byte_row_gbk1;
964 enc.byte_col = byte_col_gbk1;
965 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
966 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
967 enc.byte_row_expr = "%1$s - 0x81";
968 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
970 read_table(&enc);
971 output_charset2uni(name,&enc);
/* Returns the 1st byte value for a row: rows are numbered from byte 0x81. */
static int row_byte_gbk2 (int row)
{
  return 0x81 + row;
}
/*
 * Returns the 2nd byte value for a column. Columns 0..0x3e map to bytes
 * 0x40..0x7e; columns from 0x3f on map to bytes from 0x80 on, so byte 0x7f
 * is never produced.
 */
static int col_byte_gbk2 (int col)
{
  if (col >= 0x3f)
    return 0x41 + col;
  return 0x40 + col;
}
/* Converts a 1st byte value in 0x81..0xfe to a row index, else returns -1. */
static int byte_row_gbk2 (int byte)
{
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/*
 * Converts a 2nd byte value to a column index: bytes 0x40..0x7e give columns
 * 0..0x3e, bytes 0x80..0xa0 give columns 0x3f..0x5f. Returns -1 for any
 * other byte, including 0x7f and everything from 0xa1 on.
 */
static int byte_col_gbk2 (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0x80 && byte < 0xa1)
    return byte - 0x41;
  return -1;
}
995 static void do_gbk2_only_charset2uni (const char* name)
997 Encoding enc;
999 enc.rows = 126;
1000 enc.cols = 96;
1001 enc.row_byte = row_byte_gbk2;
1002 enc.col_byte = col_byte_gbk2;
1003 enc.byte_row = byte_row_gbk2;
1004 enc.byte_col = byte_col_gbk2;
1005 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
1006 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
1007 enc.byte_row_expr = "%1$s - 0x81";
1008 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1010 read_table(&enc);
1011 output_charset2uni(name,&enc);
1014 static void do_gbk1_only_uni2charset (const char* name)
1016 Encoding enc;
1018 enc.rows = 126;
1019 enc.cols = 190;
1020 enc.row_byte = row_byte_gbk1;
1021 enc.col_byte = col_byte_gbk1;
1022 enc.byte_row = byte_row_gbk1;
1023 enc.byte_col = byte_col_gbk1;
1024 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
1025 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
1026 enc.byte_row_expr = "%1$s - 0x81";
1027 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1029 read_table(&enc);
1030 invert(&enc); output_uni2charset_sparse(name,&enc,false);
1033 /* KSC 5601 specifics */
1036 * Reads the charset2uni table from standard input.
1038 static void read_table_ksc5601 (Encoding* enc)
1040 int row, col, i, i1, i2, c, j;
1042 enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
1043 for (row = 0; row < enc->rows; row++)
1044 enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
1046 for (row = 0; row < enc->rows; row++)
1047 for (col = 0; col < enc->cols; col++)
1048 enc->charset2uni[row][col] = 0xfffd;
1050 c = getc(stdin);
1051 ungetc(c,stdin);
1052 if (c == '#') {
1053 /* Read a unicode.org style .TXT file. */
1054 for (;;) {
1055 c = getc(stdin);
1056 if (c == EOF)
1057 break;
1058 if (c == '\n' || c == ' ' || c == '\t')
1059 continue;
1060 if (c == '#') {
1061 do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
1062 continue;
1064 ungetc(c,stdin);
1065 if (scanf("0x%x", &j) != 1)
1066 exit(1);
1067 i1 = j >> 8;
1068 i2 = j & 0xff;
1069 if (scanf(" 0x%x", &j) != 1)
1070 exit(1);
1071 /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
1072 = KS X 1001.1992, ignore the rest. */
1073 if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
1074 continue; /* KSC5601 specific */
1075 i1 &= 0x7f; /* KSC5601 specific */
1076 i2 &= 0x7f; /* KSC5601 specific */
1077 row = enc->byte_row(i1);
1078 col = enc->byte_col(i2);
1079 if (row < 0 || col < 0) {
1080 fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
1081 exit(1);
1083 enc->charset2uni[row][col] = j;
1085 } else {
1086 /* Read a table of hexadecimal Unicode values. */
1087 for (i1 = 33; i1 < 127; i1++)
1088 for (i2 = 33; i2 < 127; i2++) {
1089 i = scanf("%x", &j);
1090 if (i == EOF)
1091 goto read_done;
1092 if (i != 1)
1093 exit(1);
1094 if (j < 0 || j == 0xffff)
1095 j = 0xfffd;
1096 if (j != 0xfffd) {
1097 if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
1098 fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
1099 exit (1);
1101 enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
1104 read_done: ;
1108 static void do_ksc5601 (const char* name)
1110 Encoding enc;
1112 enc.rows = 94;
1113 enc.cols = 94;
1114 enc.row_byte = row_byte_normal;
1115 enc.col_byte = col_byte_normal;
1116 enc.byte_row = byte_row_normal;
1117 enc.byte_col = byte_col_normal;
1118 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1119 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1120 enc.byte_row_expr = "%1$s - 0x21";
1121 enc.byte_col_expr = "%1$s - 0x21";
1123 read_table_ksc5601(&enc);
1124 output_charset2uni(name,&enc);
1125 invert(&enc); output_uni2charset_sparse(name,&enc,false);
/* UHC specifics */

/* UHC part 1: 0x{81..A0}{41..5A,61..7A,81..FE} */
/* Returns the 1st byte value (0x81 + row) for a given row index. */
static int row_byte_uhc_1 (int row) {
  return 0x81 + row;
}
/* Returns the 2nd byte value for a given column index.
   Columns 0..0x19 map to 0x41..0x5a, columns 0x1a..0x33 to 0x61..0x7a,
   and columns from 0x34 on start at 0x81. */
static int col_byte_uhc_1 (int col) {
  return (col >= 0x34 ? 0x4d : col >= 0x1a ? 0x47 : 0x41) + col;
}
/* Returns the row index for a 1st byte in 0x81..0xa0, or -1 if the byte
   is outside that range. */
static int byte_row_uhc_1 (int byte) {
  if (byte >= 0x81 && byte < 0xa1)
    return byte-0x81;
  else
    return -1;
}
/* Returns the column index for a 2nd byte in 0x41..0x5a, 0x61..0x7a or
   0x81..0xfe, or -1 if the byte is outside all three ranges. */
static int byte_col_uhc_1 (int byte) {
  if (byte >= 0x41 && byte < 0x5b)
    return byte-0x41;
  else if (byte >= 0x61 && byte < 0x7b)
    return byte-0x47;
  else if (byte >= 0x81 && byte < 0xff)
    return byte-0x4d;
  else
    return -1;
}
1155 static void do_uhc_1 (const char* name)
1157 Encoding enc;
1159 enc.rows = 32;
1160 enc.cols = 178;
1161 enc.row_byte = row_byte_uhc_1;
1162 enc.col_byte = col_byte_uhc_1;
1163 enc.byte_row = byte_row_uhc_1;
1164 enc.byte_col = byte_col_uhc_1;
1165 enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa1)";
1166 enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xff)";
1167 enc.byte_row_expr = "%1$s - 0x81";
1168 enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1170 read_table(&enc);
1171 output_charset2uni_noholes_monotonic(name,&enc);
1172 invert(&enc); output_uni2charset_sparse(name,&enc,true);
/* UHC part 2: 0x{A1..C6}{41..5A,61..7A,81..A0} */
/* Returns the 1st byte value (0xa1 + row) for a given row index. */
static int row_byte_uhc_2 (int row) {
  return 0xa1 + row;
}
/* Returns the 2nd byte value for a given column index.
   Columns 0..0x19 map to 0x41..0x5a, columns 0x1a..0x33 to 0x61..0x7a,
   and columns from 0x34 on start at 0x81. */
static int col_byte_uhc_2 (int col) {
  return (col >= 0x34 ? 0x4d : col >= 0x1a ? 0x47 : 0x41) + col;
}
/* Returns the row index for a 1st byte in 0xa1..0xfe, or -1 if the byte
   is outside that range. */
static int byte_row_uhc_2 (int byte) {
  if (byte >= 0xa1 && byte < 0xff)
    return byte-0xa1;
  else
    return -1;
}
/* Returns the column index for a 2nd byte in 0x41..0x5a, 0x61..0x7a or
   0x81..0xa0, or -1 if the byte is outside all three ranges. */
static int byte_col_uhc_2 (int byte) {
  if (byte >= 0x41 && byte < 0x5b)
    return byte-0x41;
  else if (byte >= 0x61 && byte < 0x7b)
    return byte-0x47;
  else if (byte >= 0x81 && byte < 0xa1)
    return byte-0x4d;
  else
    return -1;
}
1200 static void do_uhc_2 (const char* name)
1202 Encoding enc;
1204 enc.rows = 94;
1205 enc.cols = 84;
1206 enc.row_byte = row_byte_uhc_2;
1207 enc.col_byte = col_byte_uhc_2;
1208 enc.byte_row = byte_row_uhc_2;
1209 enc.byte_col = byte_col_uhc_2;
1210 enc.check_row_expr = "(%1$s >= 0xa1 && %1$s < 0xff)";
1211 enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xa1)";
1212 enc.byte_row_expr = "%1$s - 0xa1";
1213 enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1215 read_table(&enc);
1216 output_charset2uni_noholes_monotonic(name,&enc);
1217 invert(&enc); output_uni2charset_sparse(name,&enc,true);
/* Big5 specifics */
/* Returns the 1st byte value (0xa1 + row) for a given row index. */
static int row_byte_big5 (int row) {
  return 0xa1+row;
}
/* Returns the 2nd byte value for a given column index.
   Columns 0..0x3e map to 0x40..0x7e; columns from 0x3f on start at
   0xa1. */
static int col_byte_big5 (int col) {
  return (col >= 0x3f ? 0x62 : 0x40) + col;
}
/* Returns the row index for a 1st byte in 0xa1..0xfe, or -1 if the byte
   is outside that range. */
static int byte_row_big5 (int byte) {
  if (byte >= 0xa1 && byte < 0xff)
    return byte-0xa1;
  else
    return -1;
}
/* Returns the column index for a 2nd byte in 0x40..0x7e or 0xa1..0xfe,
   or -1 if the byte is outside both ranges. */
static int byte_col_big5 (int byte) {
  if (byte >= 0x40 && byte < 0x7f)
    return byte-0x40;
  else if (byte >= 0xa1 && byte < 0xff)
    return byte-0x62;
  else
    return -1;
}
1243 static void do_big5 (const char* name)
1245 Encoding enc;
1247 enc.rows = 94;
1248 enc.cols = 157;
1249 enc.row_byte = row_byte_big5;
1250 enc.col_byte = col_byte_big5;
1251 enc.byte_row = byte_row_big5;
1252 enc.byte_col = byte_col_big5;
1253 enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
1254 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1255 enc.byte_row_expr = "%1$s - 0xa1";
1256 enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1258 read_table(&enc);
1259 output_charset2uni(name,&enc);
1260 invert(&enc); output_uni2charset_sparse(name,&enc,false);
/* HKSCS specifics */
/* Returns the 1st byte value (0x80 + row) for a given row index. */
static int row_byte_hkscs (int row) {
  return 0x80+row;
}
/* Returns the row index for a 1st byte in 0x80..0xfe, or -1 if the byte
   is outside that range. */
static int byte_row_hkscs (int byte) {
  if (byte >= 0x80 && byte < 0xff)
    return byte-0x80;
  else
    return -1;
}
1275 static void do_hkscs (const char* name)
1277 Encoding enc;
1279 enc.rows = 128;
1280 enc.cols = 157;
1281 enc.row_byte = row_byte_hkscs;
1282 enc.col_byte = col_byte_big5;
1283 enc.byte_row = byte_row_hkscs;
1284 enc.byte_col = byte_col_big5;
1285 enc.check_row_expr = "%1$s >= 0x80 && %1$s < 0xff";
1286 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1287 enc.byte_row_expr = "%1$s - 0x80";
1288 enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1290 read_table(&enc);
1291 output_charset2uni(name,&enc);
1292 invert(&enc); output_uni2charset_sparse(name,&enc,false);
/* Johab Hangul specifics */
/* Returns the 1st byte value (0x84 + row) for a given row index. */
static int row_byte_johab_hangul (int row) {
  return 0x84+row;
}
/* Returns the 2nd byte value for a given column index.
   Columns 0..0x3d map to 0x41..0x7e; columns from 0x3e on start at
   0x81. */
static int col_byte_johab_hangul (int col) {
  return (col >= 0x3e ? 0x43 : 0x41) + col;
}
/* Returns the row index for a 1st byte in 0x84..0xd3, or -1 if the byte
   is outside that range. */
static int byte_row_johab_hangul (int byte) {
  if (byte >= 0x84 && byte < 0xd4)
    return byte-0x84;
  else
    return -1;
}
/* Returns the column index for a 2nd byte in 0x41..0x7e or 0x81..0xfe,
   or -1 if the byte is outside both ranges. */
static int byte_col_johab_hangul (int byte) {
  if (byte >= 0x41 && byte < 0x7f)
    return byte-0x41;
  else if (byte >= 0x81 && byte < 0xff)
    return byte-0x43;
  else
    return -1;
}
1318 static void do_johab_hangul (const char* name)
1320 Encoding enc;
1322 enc.rows = 80;
1323 enc.cols = 188;
1324 enc.row_byte = row_byte_johab_hangul;
1325 enc.col_byte = col_byte_johab_hangul;
1326 enc.byte_row = byte_row_johab_hangul;
1327 enc.byte_col = byte_col_johab_hangul;
1328 enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
1329 enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
1330 enc.byte_row_expr = "%1$s - 0x84";
1331 enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
1333 read_table(&enc);
1334 output_charset2uni(name,&enc);
1335 invert(&enc); output_uni2charset_dense(name,&enc);
/* SJIS specifics */
/* Returns the 1st byte value for a given row index.
   Rows 0..0x1e map to 0x81..0x9f; rows from 0x1f on start at 0xe0. */
static int row_byte_sjis (int row) {
  return (row >= 0x1f ? 0xc1 : 0x81) + row;
}
/* Returns the 2nd byte value for a given column index.
   Columns 0..0x3e map to 0x40..0x7e; columns from 0x3f on skip 0x7f
   and continue at 0x80. */
static int col_byte_sjis (int col) {
  return (col >= 0x3f ? 0x41 : 0x40) + col;
}
/* Returns the row index for a 1st byte in 0x81..0x9f or >= 0xe0, or -1
   otherwise.  Note that the second range has no upper bound here. */
static int byte_row_sjis (int byte) {
  if (byte >= 0x81 && byte < 0xa0)
    return byte-0x81;
  else if (byte >= 0xe0)
    return byte-0xc1;
  else
    return -1;
}
/* Returns the column index for a 2nd byte in 0x40..0x7e or 0x80..0xfc,
   or -1 if the byte is outside both ranges. */
static int byte_col_sjis (int byte) {
  if (byte >= 0x40 && byte < 0x7f)
    return byte-0x40;
  else if (byte >= 0x80 && byte < 0xfd)
    return byte-0x41;
  else
    return -1;
}
1363 static void do_sjis (const char* name)
1365 Encoding enc;
1367 enc.rows = 94;
1368 enc.cols = 188;
1369 enc.row_byte = row_byte_sjis;
1370 enc.col_byte = col_byte_sjis;
1371 enc.byte_row = byte_row_sjis;
1372 enc.byte_col = byte_col_sjis;
1373 enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
1374 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
1375 enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
1376 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1378 read_table(&enc);
1379 output_charset2uni(name,&enc);
1380 invert(&enc); output_uni2charset_sparse(name,&enc,false);
/* GB18030 Unicode specifics */
/*
 * Generates the tables and conversion functions for the four-byte
 * GB18030 codes that map to 16-bit Unicode values.
 *
 * Reads a unicode.org style .TXT file ("0xBBBBBBBB 0xUUUU" pairs, '#'
 * comments) from standard input.  Each four-byte code b1 b2 b3 b4 with
 * b1 in 0x81..0x84, b2 and b4 in 0x30..0x39, b3 in 0x81..0xfe is
 * linearized to an index i; the mapping i -> Unicode is then verified to
 * be a monotonic union of at most 256 intervals with constant offset.
 * Prints to standard output: the interval tables, a presence bitmap and
 * the <name>_mbtowc / <name>_wctomb functions.
 *
 * Exits with status 1 on malformed input, on input that violates the
 * expected structure, or when the input contains no mapping at all.
 */
static void do_gb18030uni (const char* name)
{
  enum { table_size = 4*10*126*10 };
  /* Static storage: the table is about 200 KB, too large to put on the
     stack comfortably.  It is cleared explicitly below. */
  static int charset2uni[table_size];
  int c;
  unsigned int bytes;
  unsigned int code; /* "%x" conversions store through an unsigned int* */
  int i1, i2, i3, i4, i, j, k;
  struct { int low; int high; int diff; int total; } ranges[256];
  int ranges_count, ranges_total;

  /* 0 marks "no mapping" for an index. */
  for (i = 0; i < table_size; i++)
    charset2uni[i] = 0;

  /* Read a unicode.org style .TXT file. */
  for (;;) {
    c = getc(stdin);
    if (c == EOF)
      break;
    if (c == '\n' || c == ' ' || c == '\t')
      continue;
    if (c == '#') {
      /* Skip the comment up to the end of the line. */
      do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
      continue;
    }
    ungetc(c,stdin);
    if (scanf("0x%x", &bytes) != 1)
      exit(1);
    i1 = (bytes >> 24) & 0xff;
    i2 = (bytes >> 16) & 0xff;
    i3 = (bytes >> 8) & 0xff;
    i4 = bytes & 0xff;
    if (!(i1 >= 0x81 && i1 <= 0x84
          && i2 >= 0x30 && i2 <= 0x39
          && i3 >= 0x81 && i3 <= 0xfe
          && i4 >= 0x30 && i4 <= 0x39)) {
      fprintf(stderr, "lost entry for %02x %02x %02x %02x\n", i1, i2, i3, i4);
      exit(1);
    }
    i = (((i1-0x81) * 10 + (i2-0x30)) * 126 + (i3-0x81)) * 10 + (i4-0x30);
    if (scanf(" 0x%x", &code) != 1)
      exit(1);
    if (!(code < 0x10000))
      exit(1);
    j = code;
    charset2uni[i] = j;
  }

  /* Verify that the mapping i -> j is monotonically increasing and
     of the form
        low[k] <= i <= high[k]  =>  j = diff[k] + i
     with a set of disjoint intervals (low[k], high[k]). */
  ranges_count = 0;
  for (i = 0; i < table_size; i++)
    if (charset2uni[i] != 0) {
      int diff;
      j = charset2uni[i];
      diff = j - i;
      if (ranges_count > 0) {
        if (!(i > ranges[ranges_count-1].high))
          exit(1);
        if (!(j > ranges[ranges_count-1].high + ranges[ranges_count-1].diff))
          exit(1);
        /* Additional property: The diffs are also increasing. */
        if (!(diff >= ranges[ranges_count-1].diff))
          exit(1);
      }
      if (ranges_count > 0 && diff == ranges[ranges_count-1].diff)
        ranges[ranges_count-1].high = i;
      else {
        if (ranges_count == 256)
          exit(1);
        ranges[ranges_count].low = i;
        ranges[ranges_count].high = i;
        ranges[ranges_count].diff = diff;
        ranges_count++;
      }
    }

  /* The code below reads ranges[0] and ranges[ranges_count-1]; without
     any mapping those accesses would be invalid. */
  if (ranges_count == 0) {
    fprintf(stderr, "no mappings found in input\n");
    exit(1);
  }

  /* Determine size of bitmap. */
  ranges_total = 0;
  for (k = 0; k < ranges_count; k++) {
    ranges[k].total = ranges_total;
    ranges_total += ranges[k].high - ranges[k].low + 1;
  }

  printf("static const unsigned short %s_charset2uni_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" 0x%04x, 0x%04x", ranges[k].low, ranges[k].high);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const unsigned short %s_uni2charset_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" 0x%04x, 0x%04x", ranges[k].low + ranges[k].diff, ranges[k].high + ranges[k].diff);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const struct { unsigned short diff; unsigned short bitmap_offset; } %s_ranges[%d] = {\n ", name, ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" { %5d, 0x%04x }", ranges[k].diff, ranges[k].total);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n ");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const unsigned char %s_bitmap[%d] = {\n ", name, (ranges_total + 7) / 8);
  {
    /* One bit per index inside any interval, packed LSB first; i runs
       over the concatenated bitmap positions of all intervals. */
    int accu = 0;
    for (k = 0; k < ranges_count; k++) {
      for (i = ranges[k].total; i <= ranges[k].total + (ranges[k].high - ranges[k].low);) {
        if (charset2uni[i - ranges[k].total + ranges[k].low] != 0)
          accu |= (1 << (i % 8));
        i++;
        if ((i % 8) == 0) {
          printf(" 0x%02x", accu);
          if ((i / 8) < (ranges_total + 7) / 8) printf(",");
          if (((i / 8) % 12) == 0)
            printf("\n ");
          accu = 0;
        }
      }
      /* Consistency check: the next interval must start where this one ended. */
      if (i != (k+1 < ranges_count ? ranges[k+1].total : ranges_total)) abort();
    }
    if ((ranges_total % 8) != 0)
      printf(" 0x%02x", accu);
    printf("\n");
  }
  printf("};\n");

  printf("\n");

  printf("static int\n");
  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
  printf("{\n");
  printf(" unsigned char c1 = s[0];\n");
  printf(" if (c1 >= 0x81 && c1 <= 0x84) {\n");
  printf(" if (n >= 2) {\n");
  printf(" unsigned char c2 = s[1];\n");
  printf(" if (c2 >= 0x30 && c2 <= 0x39) {\n");
  printf(" if (n >= 3) {\n");
  printf(" unsigned char c3 = s[2];\n");
  printf(" if (c3 >= 0x81 && c3 <= 0xfe) {\n");
  printf(" if (n >= 4) {\n");
  printf(" unsigned char c4 = s[3];\n");
  printf(" if (c4 >= 0x30 && c4 <= 0x39) {\n");
  printf(" unsigned int i = (((c1 - 0x81) * 10 + (c2 - 0x30)) * 126 + (c3 - 0x81)) * 10 + (c4 - 0x30);\n");
  printf(" if (i >= %d && i <= %d) {\n", ranges[0].low, ranges[ranges_count-1].high);
  printf(" unsigned int k1 = 0;\n");
  printf(" unsigned int k2 = %d;\n", ranges_count-1);
  printf(" while (k1 < k2) {\n");
  printf(" unsigned int k = (k1 + k2) / 2;\n");
  printf(" if (i <= %s_charset2uni_ranges[2*k+1])\n", name);
  printf(" k2 = k;\n");
  printf(" else if (i >= %s_charset2uni_ranges[2*k+2])\n", name);
  printf(" k1 = k + 1;\n");
  printf(" else\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" {\n");
  printf(" unsigned int bitmap_index = i - %s_charset2uni_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf(" unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf(" *pwc = (ucs4_t) (i + diff);\n");
  printf(" return 4;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf("}\n");

  printf("\n");

  printf("static int\n");
  printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
  printf("{\n");
  printf(" if (n >= 4) {\n");
  printf(" unsigned int i = wc;\n");
  printf(" if (i >= 0x%04x && i <= 0x%04x) {\n", ranges[0].low + ranges[0].diff, ranges[ranges_count-1].high + ranges[ranges_count-1].diff);
  printf(" unsigned int k1 = 0;\n");
  printf(" unsigned int k2 = %d;\n", ranges_count-1);
  printf(" while (k1 < k2) {\n");
  printf(" unsigned int k = (k1 + k2) / 2;\n");
  printf(" if (i <= %s_uni2charset_ranges[2*k+1])\n", name);
  printf(" k2 = k;\n");
  printf(" else if (i >= %s_uni2charset_ranges[2*k+2])\n", name);
  printf(" k1 = k + 1;\n");
  printf(" else\n");
  printf(" return RET_ILUNI;\n");
  printf(" }\n");
  printf(" {\n");
  printf(" unsigned int bitmap_index = i - %s_uni2charset_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf(" unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf(" i -= diff;\n");
  printf(" r[3] = (i %% 10) + 0x30; i = i / 10;\n");
  printf(" r[2] = (i %% 126) + 0x81; i = i / 126;\n");
  printf(" r[1] = (i %% 10) + 0x30; i = i / 10;\n");
  printf(" r[0] = i + 0x81;\n");
  printf(" return 4;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILUNI;\n");
  printf(" }\n");
  printf(" return RET_TOOSMALL;\n");
  printf("}\n");
}
/* Main program */
/*
 * Usage: cjk_tab_to_h CHARSETNAME NAME < table > header
 *
 * CHARSETNAME is passed to output_title(); NAME selects which generator
 * runs and is passed on to it.  Exits with status 1 (after a diagnostic
 * on standard error) when the argument count is wrong or NAME is not one
 * of the known table names.
 */
int main (int argc, char *argv[])
{
  const char* charsetname;
  const char* name;

  if (argc != 3) {
    fprintf(stderr, "Usage: %s charsetname name < table > header\n",
            argc > 0 && argv[0] != NULL ? argv[0] : "cjk_tab_to_h");
    exit(1);
  }
  charsetname = argv[1];
  name = argv[2];

  output_title(charsetname);

  if (!strcmp(name,"gb2312")
      || !strcmp(name,"isoir165ext") || !strcmp(name,"gb12345ext")
      || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
    do_normal(name);
  else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
           || !strcmp(name,"cns11643_3") || !strcmp(name,"cns11643_4a")
           || !strcmp(name,"cns11643_4b") || !strcmp(name,"cns11643_5")
           || !strcmp(name,"cns11643_6") || !strcmp(name,"cns11643_7")
           || !strcmp(name,"cns11643_15"))
    do_normal_only_charset2uni(name);
  else if (!strcmp(name,"cns11643_inv"))
    do_cns11643_only_uni2charset(name);
  else if (!strcmp(name,"gbkext1"))
    do_gbk1_only_charset2uni(name);
  else if (!strcmp(name,"gbkext2"))
    do_gbk2_only_charset2uni(name);
  else if (!strcmp(name,"gbkext_inv"))
    do_gbk1_only_uni2charset(name);
  else if (!strcmp(name,"cp936ext") || !strcmp(name,"gb18030ext"))
    do_gbk1(name);
  else if (!strcmp(name,"ksc5601"))
    do_ksc5601(name);
  else if (!strcmp(name,"uhc_1"))
    do_uhc_1(name);
  else if (!strcmp(name,"uhc_2"))
    do_uhc_2(name);
  else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
    do_big5(name);
  else if (!strcmp(name,"hkscs"))
    do_hkscs(name);
  else if (!strcmp(name,"johab_hangul"))
    do_johab_hangul(name);
  else if (!strcmp(name,"cp932ext"))
    do_sjis(name);
  else if (!strcmp(name,"gb18030uni"))
    do_gb18030uni(name);
  else {
    fprintf(stderr, "unknown table name: %s\n", name);
    exit(1);
  }

  return 0;
}