1 /* Copyright (C) 1999-2002, 2011-2012, 2016, 2018, 2022 Free Software Foundation, Inc.
2 This file is part of the GNU LIBICONV Tools.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, see <https://www.gnu.org/licenses/>. */
18 * Generates an 8-bit character set table from a .TXT table as found on
19 * ftp.unicode.org or from a table containing the 256 Unicode values as
20 * hexadecimal integers.
23 * ./8bit_tab_to_h ISO-8859-1 iso8859_1 < tab8859_1
24 * ./8bit_tab_to_h ISO-8859-2 iso8859_2 < tab8859_2
25 * ./8bit_tab_to_h ISO-8859-3 iso8859_3 < tab8859_3
26 * ./8bit_tab_to_h ISO-8859-4 iso8859_4 < tab8859_4
27 * ./8bit_tab_to_h ISO-8859-5 iso8859_5 < tab8859_5
28 * ./8bit_tab_to_h ISO-8859-6 iso8859_6 < tab8859_6
29 * ./8bit_tab_to_h ISO-8859-7 iso8859_7 < tab8859_7
30 * ./8bit_tab_to_h ISO-8859-8 iso8859_8 < tab8859_8
31 * ./8bit_tab_to_h ISO-8859-9 iso8859_9 < tab8859_9
32 * ./8bit_tab_to_h ISO-8859-10 iso8859_10 < tab8859_10
33 * ./8bit_tab_to_h ISO-8859-14 iso8859_14 < tab8859_14
34 * ./8bit_tab_to_h ISO-8859-15 iso8859_15 < tab8859_15
35 * ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < jis0201
36 * ./8bit_tab_to_h TIS620.2533-1 tis620 < tabtis620
37 * ./8bit_tab_to_h KOI8-R koi8_r < tabkoi8_r
38 * ./8bit_tab_to_h KOI8-U koi8_u < tabkoi8_u
39 * ./8bit_tab_to_h ARMSCII-8 armscii_8 < tabarmscii_8
40 * ./8bit_tab_to_h CP1133 cp1133 < tabibm_cp1133
41 * ./8bit_tab_to_h MULELAO-1 mulelao < tabmulelao_1
42 * ./8bit_tab_to_h VISCII1.1-1 viscii1 < tabviscii
43 * ./8bit_tab_to_h TCVN-5712 tcvn < tabtcvn
44 * ./8bit_tab_to_h GEORGIAN-ACADEMY georgian_ac < tabgeorgian_academy
45 * ./8bit_tab_to_h GEORGIAN-PS georgian_ps < tabgeorgian_ps
47 * ./8bit_tab_to_h ISO-8859-1 iso8859_1 < 8859-1.TXT
48 * ./8bit_tab_to_h ISO-8859-2 iso8859_2 < 8859-2.TXT
49 * ./8bit_tab_to_h ISO-8859-3 iso8859_3 < 8859-3.TXT
50 * ./8bit_tab_to_h ISO-8859-4 iso8859_4 < 8859-4.TXT
51 * ./8bit_tab_to_h ISO-8859-5 iso8859_5 < 8859-5.TXT
52 * ./8bit_tab_to_h ISO-8859-6 iso8859_6 < 8859-6.TXT
53 * ./8bit_tab_to_h ISO-8859-7 iso8859_7 < 8859-7.TXT
54 * ./8bit_tab_to_h ISO-8859-8 iso8859_8 < 8859-8.TXT
55 * ./8bit_tab_to_h ISO-8859-9 iso8859_9 < 8859-9.TXT
56 * ./8bit_tab_to_h ISO-8859-10 iso8859_10 < 8859-10.TXT
57 * ./8bit_tab_to_h ISO-8859-14 iso8859_14 < 8859-14.TXT
58 * ./8bit_tab_to_h ISO-8859-15 iso8859_15 < 8859-15.TXT
59 * ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < JIS0201.TXT
60 * ./8bit_tab_to_h KOI8-R koi8_r < KOI8-R.TXT
62 * ./8bit_tab_to_h 'CP50221 JISX0208 extensions' cp50221_0208_ext < CP50221-0208-EXT.TXT
63 * ./8bit_tab_to_h 'CP50221 JISX0212 extensions' cp50221_0212_ext < CP50221-0212-EXT.TXT
71 int main (int argc
, char *argv
[])
73 const char* charsetname
;
74 const char* c_charsetname
;
76 const char* directory
;
77 int charset2uni
[0x100];
79 if (argc
!= 3 && argc
!= 4 && argc
!= 5)
81 charsetname
= argv
[1];
82 c_charsetname
= argv
[2];
86 char* s
= (char*) malloc(strlen(c_charsetname
)+strlen(".h")+1);
87 strcpy(s
,c_charsetname
); strcat(s
,".h");
90 directory
= (argc
> 4 ? argv
[4] : "");
92 fprintf(stderr
, "Creating %s%s\n", directory
, filename
);
99 /* Read a unicode.org style .TXT file. */
100 for (i
= 0; i
< 0x100; i
++)
101 charset2uni
[i
] = 0xfffd;
106 if (c
== '\n' || c
== ' ' || c
== '\t')
109 do { c
= getc(stdin
); } while (!(c
== EOF
|| c
== '\n'));
113 if (scanf("0x%x", &i
) != 1 || !(i
>= 0 && i
< 0x100))
115 do { c
= getc(stdin
); } while (c
== ' ' || c
== '\t');
118 if (c
== '\n' || c
== '#')
120 if (scanf("0x%x", &charset2uni
[i
]) != 1)
124 /* Read a table of hexadecimal Unicode values. */
125 for (i
= 0; i
< 0x100; i
++) {
126 if (scanf("%x", &charset2uni
[i
]) != 1)
128 if (charset2uni
[i
] < 0 || charset2uni
[i
] == 0xffff)
129 charset2uni
[i
] = 0xfffd;
131 if (scanf("%x", &i
) != EOF
)
136 /* Write the output file. */
141 char* fname
= malloc(strlen(directory
)+strlen(filename
)+1);
142 strcpy(fname
,directory
); strcat(fname
,filename
);
143 f
= fopen(fname
,"w");
149 fprintf(f
, " * Copyright (C) 1999-2022 Free Software Foundation, Inc.\n");
150 fprintf(f
, " * This file is part of the GNU LIBICONV Library.\n");
152 fprintf(f
, " * The GNU LIBICONV Library is free software; you can redistribute it\n");
153 fprintf(f
, " * and/or modify it under the terms of the GNU Lesser General Public\n");
154 fprintf(f
, " * License as published by the Free Software Foundation; either version 2\n");
155 fprintf(f
, " * of the License, or (at your option) any later version.\n");
157 fprintf(f
, " * The GNU LIBICONV Library is distributed in the hope that it will be\n");
158 fprintf(f
, " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
159 fprintf(f
, " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n");
160 fprintf(f
, " * Lesser General Public License for more details.\n");
162 fprintf(f
, " * You should have received a copy of the GNU Lesser General Public\n");
163 fprintf(f
, " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n");
164 fprintf(f
, " * If not, see <https://www.gnu.org/licenses/>.\n");
168 fprintf(f
, " * %s\n", charsetname
);
176 struct { int minline
; int maxline
; } tables
[16];
178 bool final_ret_reached
;
180 for (i1
= 0; i1
< 16; i1
++) {
181 bool all_invalid
= true;
182 bool all_identity
= true;
183 for (i2
= 0; i2
< 16; i2
++) {
185 if (charset2uni
[i
] != 0xfffd)
187 if (charset2uni
[i
] != i
)
188 all_identity
= false;
192 else if (all_identity
)
198 for (i1
= 0; i1
< 16; i1
++) {
200 if (i1
> 0 && tableno
> 0 && line
[i1
-1] == tableno
-1) {
201 line
[i1
] = tableno
-1;
202 tables
[tableno
-1].maxline
= i1
;
205 line
[i1
] = tableno
-1;
206 tables
[tableno
-1].minline
= tables
[tableno
-1].maxline
= i1
;
210 some_invalid
= false;
211 for (i
= 0; i
< 0x100; i
++)
212 if (charset2uni
[i
] == 0xfffd)
216 for (t
= 0; t
< tableno
; t
++) {
217 fprintf(f
, "static const unsigned short %s_2uni", c_charsetname
);
219 fprintf(f
, "_%d", t
+1);
220 fprintf(f
, "[%d] = {\n", 16*(tables
[t
].maxline
-tables
[t
].minline
+1));
221 for (i1
= tables
[t
].minline
; i1
<= tables
[t
].maxline
; i1
++) {
222 fprintf(f
, " /* 0x%02x */\n", 16*i1
);
223 for (i2
= 0; i2
< 2; i2
++) {
225 for (i3
= 0; i3
< 8; i3
++) {
227 fprintf(f
, " 0x%04x,", charset2uni
[i
]);
236 final_ret_reached
= false;
237 fprintf(f
, "static int\n%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)\n", c_charsetname
);
239 fprintf(f
, " unsigned char c = *s;\n");
241 for (i1
= 0; i1
< 16;) {
244 for (i2
= i1
; i2
< 16 && line
[i2
] == t
; i2
++);
245 indent
= (i1
== 0 && i2
== 16 ? " " : " ");
249 fprintf(f
, " if (c < 0x%02x) {\n", 16*i2
);
253 fprintf(f
, " else {\n");
255 fprintf(f
, " else if (c < 0x%02x) {\n", 16*i2
);
259 final_ret_reached
= true;
260 } else if (t
== -1) {
261 fprintf(f
, "%s*pwc = (ucs4_t) c;\n", indent
);
262 fprintf(f
, "%sreturn 1;\n", indent
);
264 fprintf(f
, "%s", indent
);
265 some_invalid
= false;
266 for (i
= 16*i1
; i
< 16*i2
; i
++)
267 if (charset2uni
[i
] == 0xfffd)
270 fprintf(f
, "unsigned short wc = ");
272 fprintf(f
, "*pwc = (ucs4_t) ");
273 fprintf(f
, "%s_2uni", c_charsetname
);
275 fprintf(f
, "_%d", t
+1);
277 if (tables
[t
].minline
> 0)
278 fprintf(f
, "-0x%02x", 16*tables
[t
].minline
);
281 fprintf(f
, "%sif (wc != 0xfffd) {\n", indent
);
282 fprintf(f
, "%s *pwc = (ucs4_t) wc;\n", indent
);
283 fprintf(f
, "%s return 1;\n", indent
);
284 fprintf(f
, "%s}\n", indent
);
285 final_ret_reached
= true;
287 fprintf(f
, "%sreturn 1;\n", indent
);
290 if (!(i1
== 0 && i2
== 16))
294 if (final_ret_reached
)
295 fprintf(f
, " return RET_ILSEQ;\n");
297 for (i1
= 0; i1
< 16;) {
299 for (i2
= i1
; i2
< 16 && line
[i2
] == t
; i2
++);
304 fprintf(f
, " if (c < 0x%02x)\n ", 16*i2
);
308 fprintf(f
, " else\n ");
310 fprintf(f
, " else if (c < 0x%02x)\n ", 16*i2
);
314 fprintf(f
, "*pwc = (ucs4_t) c;\n");
316 fprintf(f
, "*pwc = (ucs4_t) %s_2uni", c_charsetname
);
318 fprintf(f
, "_%d", t
+1);
320 if (tables
[t
].minline
> 0)
321 fprintf(f
, "-0x%02x", 16*tables
[t
].minline
);
326 fprintf(f
, " return 1;\n");
335 int uni2charset
[0x10000];
339 struct { int minline
; int maxline
; int usecount
; const char* suffix
; } tables
[0x2000];
342 int i
, j
, p
, j1
, j2
, t
;
344 for (j
= 0; j
< 0x10000; j
++)
346 for (p
= 0; p
< 0x100; p
++)
348 for (i
= 0; i
< 0x100; i
++) {
355 for (j1
= 0; j1
< 0x2000; j1
++) {
356 bool all_invalid
= true;
357 bool all_identity
= true;
358 for (j2
= 0; j2
< 8; j2
++) {
360 if (uni2charset
[j
] != 0)
362 if (uni2charset
[j
] != j
)
363 all_identity
= false;
367 else if (all_identity
)
373 for (j1
= 0; j1
< 0x2000; j1
++) {
376 && ((j1
> 0 && line
[j1
-1] == tableno
-1)
377 || ((tables
[tableno
-1].maxline
>> 5) == (j1
>> 5)
378 && j1
- tables
[tableno
-1].maxline
<= 8))) {
379 line
[j1
] = tableno
-1;
380 tables
[tableno
-1].maxline
= j1
;
383 line
[j1
] = tableno
-1;
384 tables
[tableno
-1].minline
= tables
[tableno
-1].maxline
= j1
;
388 for (t
= 0; t
< tableno
; t
++) {
389 tables
[t
].usecount
= 0;
390 j1
= 8*tables
[t
].minline
;
391 j2
= 8*(tables
[t
].maxline
+1);
392 for (j
= j1
; j
< j2
; j
++)
393 if (uni2charset
[j
] != 0)
394 tables
[t
].usecount
++;
396 for (t
= 0, p
= -1, i
= 0; t
< tableno
; t
++) {
397 if (tables
[t
].usecount
> 1) {
399 if (p
== tables
[t
].minline
>> 5) {
400 s
= (char*) malloc(5+1);
401 sprintf(s
, "%02x_%d", p
, ++i
);
403 p
= tables
[t
].minline
>> 5;
404 s
= (char*) malloc(2+1);
405 sprintf(s
, "%02x", p
);
407 tables
[t
].suffix
= s
;
409 tables
[t
].suffix
= NULL
;
413 for (t
= 0; t
< tableno
; t
++)
414 if (tables
[t
].usecount
> 1) {
416 fprintf(f
, "static const unsigned char %s_page%s[%d] = {\n", c_charsetname
, tables
[t
].suffix
, 8*(tables
[t
].maxline
-tables
[t
].minline
+1));
417 for (j1
= tables
[t
].minline
; j1
<= tables
[t
].maxline
; j1
++) {
418 if ((j1
% 0x20) == 0 && j1
> tables
[t
].minline
)
419 fprintf(f
, " /* 0x%04x */\n", 8*j1
);
421 for (j2
= 0; j2
< 8; j2
++) {
423 fprintf(f
, " 0x%02x,", uni2charset
[j
]);
425 fprintf(f
, " /* 0x%02x-0x%02x */\n", 8*(j1
% 0x20), 8*(j1
% 0x20)+7);
433 for (j1
= 0; j1
< 0x2000;) {
435 for (j2
= j1
; j2
< 0x2000 && line
[j2
] == t
; j2
++);
437 j2
= tables
[t
].maxline
+1;
438 if (!(t
== -2 || (t
== -1 && j1
== 0)))
443 fprintf(f
, "static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)\n", c_charsetname
);
446 fprintf(f
, " unsigned char c = 0;\n");
447 for (j1
= 0; j1
< 0x2000;) {
449 for (j2
= j1
; j2
< 0x2000 && line
[j2
] == t
; j2
++);
451 if (j1
!= tables
[t
].minline
) abort();
452 if (j2
> tables
[t
].maxline
+1) abort();
453 j2
= tables
[t
].maxline
+1;
460 fprintf(f
, " else ");
461 if (t
>= 0 && tables
[t
].usecount
== 0) abort();
462 if (t
>= 0 && tables
[t
].usecount
== 1) {
463 if (j2
!= j1
+1) abort();
464 for (j
= 8*j1
; j
< 8*j2
; j
++)
465 if (uni2charset
[j
] != 0) {
466 fprintf(f
, "if (wc == 0x%04x)\n c = 0x%02x;\n", j
, uni2charset
[j
]);
471 fprintf(f
, "if (wc < 0x%04x)", 8*j2
);
473 fprintf(f
, "if (wc >= 0x%04x && wc < 0x%04x)", 8*j1
, 8*j2
);
477 /* If wc == 0, the function must return 1, not -1. */
478 fprintf(f
, " {\n *r = wc;\n return 1;\n }\n");
480 fprintf(f
, "\n c = wc;\n");
482 fprintf(f
, "\n c = %s_page%s[wc", c_charsetname
, tables
[t
].suffix
);
483 if (tables
[t
].minline
> 0)
484 fprintf(f
, "-0x%04x", 8*j1
);
486 if (j1
== 0 && uni2charset
[0] == 0)
487 /* If wc == 0, the function must return 1, not -1. */
496 fprintf(f
, " if (c != 0 || wc == 0) {\n");
498 fprintf(f
, " if (c != 0) {\n");
499 fprintf(f
, " *r = c;\n");
500 fprintf(f
, " return 1;\n");
503 fprintf(f
, " return RET_ILUNI;\n");
508 if (ferror(f
) || fclose(f
))
514 int i1
, i2
, i3
, i1_min
, i1_max
, j1
, j2
;
518 for (i1
= 0; i1
< 16; i1
++)
519 for (i2
= 0; i2
< 16; i2
++)
520 if (charset2uni
[16*i1
+i2
] != 0xfffd) {
521 if (i1_min
> i1
) i1_min
= i1
;
522 if (i1_max
< i1
) i1_max
= i1
;
524 printf("static const unsigned short %s_2uni[%d] = {\n",
525 name
, 16*(i1_max
-i1_min
+1));
526 for (i1
= i1_min
; i1
<= i1_max
; i1
++) {
527 printf(" /""* 0x%02x *""/\n", 16*i1
);
528 for (i2
= 0; i2
< 2; i2
++) {
530 for (i3
= 0; i3
< 8; i3
++) {
531 if (i3
> 0) printf(" ");
532 printf("0x%04x,", charset2uni
[16*i1
+8*i2
+i3
]);
540 for (p
= 0; p
< 0x100; p
++)
542 for (i
= 0; i
< 0x100; i
++)
543 if (charset2uni
[i
] != 0xfffd)
544 pages
[charset2uni
[i
]>>8] = 1;
545 for (p
= 0; p
< 0x100; p
++)
549 for (j1
= 0; j1
< 32; j1
++)
550 for (j2
= 0; j2
< 8; j2
++)
551 if (uni2charset
[256*p
+8*j1
+j2
] != 0) {
552 if (j1_min
> j1
) j1_min
= j1
;
553 if (j1_max
< j1
) j1_max
= j1
;
555 printf("static const unsigned char %s_page%02x[%d] = {\n",
556 name
, p
, 8*(j1_max
-j1_min
+1));
557 for (j1
= j1_min
; j1
<= j1_max
; j1
++) {
559 for (j2
= 0; j2
< 8; j2
++)
560 printf("0x%02x, ", uni2charset
[256*p
+8*j1
+j2
]);
561 printf("/""* 0x%02x-0x%02x *""/\n", 8*j1
, 8*j1
+7);