CP1258 handles combining characters.
[libiconv.git] / lib / gentranslit.c
bloba53ad7496c483bebf81f581132dbf283dd0f2a42
/* Copyright (C) 1999-2001 Free Software Foundation, Inc.
   This file is part of the GNU LIBICONV Library.

   The GNU LIBICONV Library is free software; you can redistribute it
   and/or modify it under the terms of the GNU Library General Public
   License as published by the Free Software Foundation; either version 2
   of the License, or (at your option) any later version.

   The GNU LIBICONV Library is distributed in the hope that it will be
   useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the GNU LIBICONV Library; see the file COPYING.LIB.
   If not, write to the Free Software Foundation, Inc., 59 Temple Place -
   Suite 330, Boston, MA 02111-1307, USA. */
/*
 * Generates a table of small strings, used for transliteration, from a table
 * containing lines of the form
 *   Unicode <tab> utf-8 replacement <tab> # comment
 */
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <stdbool.h>
/* Prints MESSAGE to stderr and terminates the program with exit status 1. */
static void die (const char *message)
{
  fprintf(stderr, "gentranslit: %s\n", message);
  exit(1);
}

/*
 * Reads transliteration definitions from stdin and writes C source to stdout.
 *
 * Input: lines starting with '#' are skipped. Every other line must be
 *   <hex code point> <tab> <UTF-8 replacement> <tab> <rest of line, ignored>
 *
 * Output:
 *   - translit_data[]: for each code point, a length entry followed by the
 *     replacement's characters (one unsigned short per character);
 *   - translit_pageXX[] arrays mapping code points to an index into
 *     translit_data (or -1);
 *   - the translit_index(wc) macro that selects the right page.
 *
 * Takes no command-line arguments. Exits with status 1 on malformed input,
 * on table overflow, on out-of-memory and on a stdout write error; calls
 * abort() if an internal consistency check fails; exits with 0 otherwise.
 */
int main (int argc, char *argv[])
{
  enum {
    DATA_CAPACITY = 0x100000, /* number of entries available in data[] */
    UNICODE_LIMIT = 0x10000,  /* code points handled: 0 .. 0xFFFF */
    LINE_COUNT = 0x2000,      /* UNICODE_LIMIT / 8: one line = 8 code points */
    SUFFIX_SIZE = 16          /* room for "%02x_%d" with any table number */
  };
  /* The tables are static because together they are far too large for the
     default stack of many platforms. */
  static unsigned short data[DATA_CAPACITY];
  static int uni2index[UNICODE_LIMIT];
  static const char *const header[] = {
    "/*\n",
    " * Copyright (C) 1999-2001 Free Software Foundation, Inc.\n",
    " * This file is part of the GNU LIBICONV Library.\n",
    " *\n",
    " * The GNU LIBICONV Library is free software; you can redistribute it\n",
    " * and/or modify it under the terms of the GNU Library General Public\n",
    " * License as published by the Free Software Foundation; either version 2\n",
    " * of the License, or (at your option) any later version.\n",
    " *\n",
    " * The GNU LIBICONV Library is distributed in the hope that it will be\n",
    " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
    " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n",
    " * Library General Public License for more details.\n",
    " *\n",
    " * You should have received a copy of the GNU Library General Public\n",
    " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n",
    " * If not, write to the Free Software Foundation, Inc., 59 Temple Place -\n",
    " * Suite 330, Boston, MA 02111-1307, USA.\n",
    " */\n",
    "\n",
    "/*\n",
    " * Transliteration table\n",
    " */\n",
    "\n"
  };
  int index;

  (void) argv;
  if (argc != 1)
    die("no arguments expected; the table is read from standard input");

  /* Emit the license and title comments of the generated file. */
  {
    size_t k;
    for (k = 0; k < sizeof(header) / sizeof(header[0]); k++)
      fputs(header[k], stdout);
  }

  /* Pass 1: parse the input into data[] and uni2index[]. */
  {
    int c;
    int j;
    for (j = 0; j < UNICODE_LIMIT; j++)
      uni2index[j] = -1;
    index = 0;
    for (;;) {
      unsigned int code;

      c = getc(stdin);
      if (c == EOF)
        break;
      if (c == '#') {
        /* Comment line: skip to the end of the line. */
        do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
        continue;
      }
      ungetc(c,stdin);
      /* %x stores through an unsigned int pointer, hence the separate
         variable; the value is range-checked before it indexes uni2index. */
      if (scanf("%x",&code) != 1)
        die("expected a hexadecimal code point at the start of a line");
      if (code >= UNICODE_LIMIT)
        die("code point is outside the supported range 0..0xFFFF");
      j = (int) code;
      c = getc(stdin);
      if (c != '\t')
        die("expected a tab after the code point");
      /* Read the replacement, one UTF-8 character per iteration, up to the
         terminating tab. */
      for (;;) {
        c = getc(stdin);
        if (c == EOF || c == '\n')
          die("replacement string is not terminated by a tab");
        if (c == '\t')
          break;
        if (uni2index[j] < 0) {
          /* First character for this code point: reserve the slot that
             will receive the length once the replacement is complete. */
          if (index >= DATA_CAPACITY)
            die("too many table entries");
          uni2index[j] = index;
          data[index++] = 0;
        }
        if (c >= 0x80) {
          /* Finish reading an UTF-8 character. */
          if (c < 0xc0)
            die("unexpected UTF-8 continuation byte");
          else {
            unsigned int i = (c < 0xe0 ? 2 : c < 0xf0 ? 3 : c < 0xf8 ? 4 : c < 0xfc ? 5 : 6);
            c &= (1 << (8-i)) - 1;
            while (--i > 0) {
              int cc = getc(stdin);
              if (!(cc >= 0x80 && cc < 0xc0))
                die("truncated or invalid UTF-8 sequence");
              c <<= 6; c |= (cc & 0x3f);
            }
          }
        }
        /* Table elements are unsigned short; refuse values that would be
           silently truncated. */
        if (c > 0xFFFF)
          die("replacement character does not fit in 16 bits");
        if (index >= DATA_CAPACITY)
          die("too many table entries");
        data[index++] = (unsigned short) c;
      }
      /* Store the number of entries written after the reserved slot. */
      if (uni2index[j] >= 0)
        data[uni2index[j]] = (unsigned short) (index - uni2index[j] - 1);
      /* Ignore the rest of the line (the comment column). */
      do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
    }
  }

  /* Emit translit_data[]. Values below 32 start a new output line. */
  printf("static const unsigned short translit_data[%d] = {",index);
  {
    int i;
    for (i = 0; i < index; i++) {
      if (data[i] < 32)
        printf("\n %3d,",data[i]);
      else if (data[i] == '\'')
        printf("'\\'',");
      else if (data[i] == '\\')
        printf("'\\\\',");
      else if (data[i] < 127)
        printf(" '%c',",data[i]);
      else if (data[i] < 256)
        printf("0x%02X,",(unsigned int) data[i]);
      else
        printf("0x%04X,",(unsigned int) data[i]);
    }
    printf("\n};\n");
  }
  printf("\n");

  /* Pass 2: group the used lines into tables and emit the page arrays and
     the translit_index macro. */
  {
    static int line[LINE_COUNT];
    static struct { int minline; int maxline; int usecount; const char* suffix; } tables[LINE_COUNT];
    int tableno;
    int i, j, p, j1, j2, t;
    bool any_page;

    /* Mark each line: 0 if it contains at least one mapped code point,
       -1 otherwise. */
    for (j1 = 0; j1 < LINE_COUNT; j1++) {
      bool all_invalid = true;
      for (j2 = 0; j2 < 8; j2++) {
        j = 8*j1+j2;
        if (uni2index[j] >= 0)
          all_invalid = false;
      }
      line[j1] = (all_invalid ? -1 : 0);
    }

    /* Assign used lines to tables. A line joins the previous table when it
       directly follows it, or when it lies in the same 32-line group and at
       most 8 lines after that table's last line. */
    tableno = 0;
    for (j1 = 0; j1 < LINE_COUNT; j1++) {
      if (line[j1] >= 0) {
        if (tableno > 0
            && ((j1 > 0 && line[j1-1] == tableno-1)
                || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                    && j1 - tables[tableno-1].maxline <= 8))) {
          line[j1] = tableno-1;
          tables[tableno-1].maxline = j1;
        } else {
          tableno++;
          line[j1] = tableno-1;
          tables[tableno-1].minline = tables[tableno-1].maxline = j1;
        }
      }
    }

    /* Count the mapped code points covered by each table. */
    for (t = 0; t < tableno; t++) {
      tables[t].usecount = 0;
      j1 = 8*tables[t].minline;
      j2 = 8*(tables[t].maxline+1);
      for (j = j1; j < j2; j++)
        if (uni2index[j] >= 0)
          tables[t].usecount++;
    }

    /* Name every table that gets its own array: "XX" for the first table of
       a 32-line group, "XX_N" for further tables of the same group. The
       counter N is deliberately not reset between groups. The strings live
       until the process exits. */
    for (t = 0, p = -1, i = 0; t < tableno; t++) {
      if (tables[t].usecount > 1) {
        char* s = malloc(SUFFIX_SIZE);
        if (s == NULL)
          die("out of memory");
        if (p == tables[t].minline >> 5) {
          snprintf(s, SUFFIX_SIZE, "%02x_%d", (unsigned int) p, ++i);
        } else {
          p = tables[t].minline >> 5;
          snprintf(s, SUFFIX_SIZE, "%02x", (unsigned int) p);
        }
        tables[t].suffix = s;
      } else
        tables[t].suffix = NULL;
    }

    /* Emit one array per table that maps more than one code point. */
    any_page = false;
    for (t = 0; t < tableno; t++)
      if (tables[t].usecount > 1) {
        any_page = true;
        printf("static const short translit_page%s[%d] = {\n", tables[t].suffix, 8*(tables[t].maxline-tables[t].minline+1));
        for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
          if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
            printf(" /* 0x%04x */\n", (unsigned int) (8*j1));
          printf(" ");
          for (j2 = 0; j2 < 8; j2++) {
            j = 8*j1+j2;
            printf(" %4d,", uni2index[j]);
          }
          printf(" /* 0x%02x-0x%02x */\n", (unsigned int) (8*(j1 % 0x20)), (unsigned int) (8*(j1 % 0x20)+7));
        }
        printf("};\n");
      }
    if (any_page)
      printf("\n");

    /* Emit the lookup macro as a chain of conditional expressions. */
    printf("#define translit_index(wc) \\\n (");
    for (j1 = 0; j1 < LINE_COUNT;) {
      t = line[j1];
      /* Find the end of the run of lines carrying the same mark. */
      for (j2 = j1; j2 < LINE_COUNT && line[j2] == t; j2++)
        ;
      if (t >= 0) {
        if (j1 != tables[t].minline) abort();
        if (j2 > tables[t].maxline+1) abort();
        /* A table may contain unused lines; jump to its real end. */
        j2 = tables[t].maxline+1;
        if (tables[t].usecount == 0) abort();
        if (tables[t].usecount == 1) {
          /* Single mapping: compare against the code point directly
             instead of emitting an array. */
          if (j2 != j1+1) abort();
          for (j = 8*j1; j < 8*j2; j++)
            if (uni2index[j] >= 0) {
              printf("wc == 0x%04x ? %d", (unsigned int) j, uni2index[j]);
              break;
            }
        } else {
          if (j1 == 0) {
            printf("wc < 0x%04x", (unsigned int) (8*j2));
          } else {
            printf("wc >= 0x%04x && wc < 0x%04x", (unsigned int) (8*j1), (unsigned int) (8*j2));
          }
          printf(" ? translit_page%s[wc", tables[t].suffix);
          if (tables[t].minline > 0)
            printf("-0x%04x", (unsigned int) (8*j1));
          printf("]");
        }
        printf(" : \\\n ");
      }
      j1 = j2;
    }
    printf("-1)\n");
  }

  /* Report a failed write through the exit status. */
  fflush(stdout);
  if (ferror(stdout))
    exit(1);
  exit(0);
}