update dev300-m58
[ooovba.git] / i18npool / source / textconversion / genconv_dict.cxx
blobe3c9d595bc8fd271a336d372c4185493febe51ee
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: genconv_dict.cxx,v $
10 * $Revision: 1.12.22.1 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 // MARKER(update_precomp.py): autogen include statement, do not remove
32 #include "precompiled_i18npool.hxx"
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include <sal/main.h>
38 #include <sal/types.h>
39 #include <rtl/strbuf.hxx>
40 #include <rtl/ustring.hxx>
42 using namespace ::rtl;
44 void make_hhc_char(FILE *sfp, FILE *cfp);
45 void make_stc_char(FILE *sfp, FILE *cfp);
46 void make_stc_word(FILE *sfp, FILE *cfp);
48 /* Main Procedure */
50 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
52 FILE *sfp, *cfp;
54 if (argc < 4) exit(-1);
57 sfp = fopen(argv[2], "rb"); // open the source file for read;
58 if (sfp == NULL)
60 printf("Open the dictionary source file failed.");
61 return -1;
64 // create the C source file to write
65 cfp = fopen(argv[3], "wb");
66 if (cfp == NULL) {
67 fclose(sfp);
68 printf("Can't create the C source file.");
69 return -1;
72 fprintf(cfp, "/*\n");
73 fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n");
74 fprintf(cfp, " * All Rights Reserved.\n");
75 fprintf(cfp, " */\n\n");
76 fprintf(cfp, "/* !!!The file is generated automatically. DONOT edit the file manually!!! */\n\n");
77 fprintf(cfp, "#include <sal/types.h>\n");
78 fprintf(cfp, "#include <textconversion.hxx>\n");
79 fprintf(cfp, "\nextern \"C\" {\n");
81 if (strcmp(argv[1], "hhc_char") == 0)
82 make_hhc_char(sfp, cfp);
83 else if (strcmp(argv[1], "stc_char") == 0)
84 make_stc_char(sfp, cfp);
85 else if (strcmp(argv[1], "stc_word") == 0)
86 make_stc_word(sfp, cfp);
88 fprintf (cfp, "}\n");
90 fclose(sfp);
91 fclose(cfp);
93 return 0;
94 } // end of main
96 // Hangul/Hanja character conversion
97 void make_hhc_char(FILE *sfp, FILE *cfp)
99 sal_Int32 count, address, i, j, k;
100 sal_Unicode Hanja2HangulData[0x10000];
101 for (i = 0; i < 0x10000; i++) {
102 Hanja2HangulData[i] = 0;
104 sal_uInt16 Hangul2HanjaData[10000][3];
106 // generate main dict. data array
107 fprintf(cfp, "\nstatic const sal_Unicode Hangul2HanjaData[] = {");
109 sal_Char Cstr[1024];
110 count = 0;
111 address = 0;
112 while (fgets(Cstr, 1024, sfp)) {
113 // input file is in UTF-8 encoding (Hangul:Hanja)
114 // don't convert last new line character to Ostr.
115 OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
116 const sal_Unicode *Ustr = Ostr.getStr();
117 sal_Int32 len = Ostr.getLength();
119 Hangul2HanjaData[count][0] = Ustr[0];
120 Hangul2HanjaData[count][1] = sal::static_int_cast<sal_uInt16>( address );
121 Hangul2HanjaData[count][2] = sal::static_int_cast<sal_uInt16>( len - 2 );
122 count++;
124 for (i = 2; i < len; i++) {
125 Hanja2HangulData[Ustr[i]] = Ustr[0];
126 if (address++ % 16 == 0)
127 fprintf(cfp, "\n\t");
128 fprintf(cfp, "0x%04x, ", Ustr[i]);
131 fprintf(cfp, "\n};\n");
133 fprintf(cfp, "\nstatic const com::sun::star::i18n::Hangul_Index Hangul2HanjaIndex[] = {\n");
134 for (i = 0; i < count; i++)
135 fprintf(cfp, "\t{ 0x%04x, 0x%04x, 0x%02x },\n",
136 Hangul2HanjaData[i][0],
137 Hangul2HanjaData[i][1],
138 Hangul2HanjaData[i][2]);
139 fprintf(cfp, "};\n");
141 fprintf(cfp, "\nstatic const sal_uInt16 Hanja2HangulIndex[] = {");
143 address=0;
144 for (i = 0; i < 0x10; i++) {
145 fprintf(cfp, "\n\t");
146 for (j = 0; j < 0x10; j++) {
147 for (k = 0; k < 0x100; k++) {
148 if (Hanja2HangulData[((i*0x10)+j)*0x100+k] != 0)
149 break;
151 fprintf(
152 cfp, "0x%04lx, ",
153 sal::static_int_cast< unsigned long >(
154 k < 0x100 ? (address++)*0x100 : 0xFFFF));
157 fprintf(cfp, "\n};\n");
159 fprintf(cfp, "\nstatic const sal_Unicode Hanja2HangulData[] = {");
161 for (i = 0; i < 0x100; i++) {
162 for (j = 0; j < 0x100; j++) {
163 if (Hanja2HangulData[i*0x100+j] != 0)
164 break;
166 if (j < 0x100) {
167 for (j = 0; j < 0x10; j++) {
168 fprintf(cfp, "\n\t");
169 for (k = 0; k < 0x10; k++) {
170 sal_Unicode c = Hanja2HangulData[((i*0x10+j)*0x10)+k];
171 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
176 fprintf(cfp, "\n};\n");
178 // create function to return arrays
179 fprintf (cfp, "\tconst sal_Unicode* getHangul2HanjaData() { return Hangul2HanjaData; }\n");
180 fprintf (cfp, "\tconst com::sun::star::i18n::Hangul_Index* getHangul2HanjaIndex() { return Hangul2HanjaIndex; }\n");
181 fprintf (cfp, "\tsal_Int16 getHangul2HanjaIndexCount() { return sizeof(Hangul2HanjaIndex) / sizeof(com::sun::star::i18n::Hangul_Index); }\n");
182 fprintf (cfp, "\tconst sal_uInt16* getHanja2HangulIndex() { return Hanja2HangulIndex; }\n");
183 fprintf (cfp, "\tconst sal_Unicode* getHanja2HangulData() { return Hanja2HangulData; }\n");
186 // Simplified/Traditional Chinese character conversion
187 void make_stc_char(FILE *sfp, FILE *cfp)
189 sal_Int32 address, i, j, k;
190 sal_Unicode SChinese2TChineseData[0x10000];
191 sal_Unicode SChinese2VChineseData[0x10000];
192 sal_Unicode TChinese2SChineseData[0x10000];
193 for (i = 0; i < 0x10000; i++) {
194 SChinese2TChineseData[i] = 0;
195 SChinese2VChineseData[i] = 0;
196 TChinese2SChineseData[i] = 0;
199 sal_Char Cstr[1024];
200 while (fgets(Cstr, 1024, sfp)) {
201 // input file is in UTF-8 encoding (SChinese:TChinese)
202 // don't convert last new line character to Ostr.
203 OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
204 const sal_Unicode *Ustr = Ostr.getStr();
205 sal_Int32 len = Ostr.getLength();
206 if (Ustr[1] == sal_Unicode('v'))
207 SChinese2VChineseData[Ustr[0]] = Ustr[2];
208 else {
209 SChinese2TChineseData[Ustr[0]] = Ustr[2];
210 if (SChinese2VChineseData[Ustr[0]] == 0)
211 SChinese2VChineseData[Ustr[0]] = Ustr[2];
213 for (i = 2; i < len; i++)
214 TChinese2SChineseData[Ustr[i]] = Ustr[0];
217 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2T[] = {");
219 address=0;
220 for (i = 0; i < 0x10; i++) {
221 fprintf(cfp, "\n\t");
222 for (j = 0; j < 0x10; j++) {
223 for (k = 0; k < 0x100; k++) {
224 if (SChinese2TChineseData[((i*0x10)+j)*0x100+k] != 0)
225 break;
227 fprintf(
228 cfp, "0x%04lx, ",
229 sal::static_int_cast< unsigned long >(
230 k < 0x100 ? (address++)*0x100 : 0xFFFF));
233 fprintf(cfp, "\n};\n");
235 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2T[] = {");
237 for (i = 0; i < 0x100; i++) {
238 for (j = 0; j < 0x100; j++) {
239 if (SChinese2TChineseData[i*0x100+j] != 0)
240 break;
242 if (j < 0x100) {
243 for (j = 0; j < 0x10; j++) {
244 fprintf(cfp, "\n\t");
245 for (k = 0; k < 0x10; k++) {
246 sal_Unicode c = SChinese2TChineseData[((i*0x10+j)*0x10)+k];
247 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
252 fprintf(cfp, "\n};\n");
254 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2V[] = {");
256 address=0;
257 for (i = 0; i < 0x10; i++) {
258 fprintf(cfp, "\n\t");
259 for (j = 0; j < 0x10; j++) {
260 for (k = 0; k < 0x100; k++) {
261 if (SChinese2VChineseData[((i*0x10)+j)*0x100+k] != 0)
262 break;
264 fprintf(
265 cfp, "0x%04lx, ",
266 sal::static_int_cast< unsigned long >(
267 k < 0x100 ? (address++)*0x100 : 0xFFFF));
270 fprintf(cfp, "\n};\n");
272 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2V[] = {");
274 for (i = 0; i < 0x100; i++) {
275 for (j = 0; j < 0x100; j++) {
276 if (SChinese2VChineseData[i*0x100+j] != 0)
277 break;
279 if (j < 0x100) {
280 for (j = 0; j < 0x10; j++) {
281 fprintf(cfp, "\n\t");
282 for (k = 0; k < 0x10; k++) {
283 sal_Unicode c = SChinese2VChineseData[((i*0x10+j)*0x10)+k];
284 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
289 fprintf(cfp, "\n};\n");
291 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_T2S[] = {");
293 address=0;
294 for (i = 0; i < 0x10; i++) {
295 fprintf(cfp, "\n\t");
296 for (j = 0; j < 0x10; j++) {
297 for (k = 0; k < 0x100; k++) {
298 if (TChinese2SChineseData[((i*0x10)+j)*0x100+k] != 0)
299 break;
301 fprintf(
302 cfp, "0x%04lx, ",
303 sal::static_int_cast< unsigned long >(
304 k < 0x100 ? (address++)*0x100 : 0xFFFF));
307 fprintf(cfp, "\n};\n");
309 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_T2S[] = {");
311 for (i = 0; i < 0x100; i++) {
312 for (j = 0; j < 0x100; j++) {
313 if (TChinese2SChineseData[i*0x100+j] != 0)
314 break;
316 if (j < 0x100) {
317 for (j = 0; j < 0x10; j++) {
318 fprintf(cfp, "\n\t");
319 for (k = 0; k < 0x10; k++) {
320 sal_Unicode c = TChinese2SChineseData[((i*0x10+j)*0x10)+k];
321 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
326 fprintf(cfp, "\n};\n");
328 // create function to return arrays
329 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2T() { return STC_CharIndex_S2T; }\n");
330 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2T() { return STC_CharData_S2T; }\n");
331 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2V() { return STC_CharIndex_S2V; }\n");
332 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2V() { return STC_CharData_S2V; }\n");
333 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_T2S() { return STC_CharIndex_T2S; }\n");
334 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_T2S() { return STC_CharData_T2S; }\n");
338 typedef struct {
339 sal_uInt16 address;
340 sal_Int32 len;
341 sal_Unicode *data;
342 } Index;
344 extern "C" {
345 int Index_comp(const void* s1, const void* s2)
347 Index *p1 = (Index*)s1, *p2 = (Index*)s2;
348 int result = p1->len - p2->len;
349 for (int i = 0; result == 0 && i < p1->len; i++)
350 result = *(p1->data+i) - *(p2->data+i);
351 return result;
355 // Simplified/Traditional Chinese word conversion
356 void make_stc_word(FILE *sfp, FILE *cfp)
358 sal_Int32 count, i, length;
359 sal_Unicode STC_WordData[0x10000];
360 Index *STC_WordEntry_S2T = (Index*) malloc(0x10000 * sizeof(Index));
361 Index *STC_WordEntry_T2S = (Index*) malloc(0x10000 * sizeof(Index));
362 sal_Int32 count_S2T = 0, count_T2S = 0;
363 sal_Int32 line = 0, char_total = 0;
364 sal_Char Cstr[1024];
366 while (fgets(Cstr, 1024, sfp)) {
367 // input file is in UTF-8 encoding (SChinese:TChinese)
368 // don't convert last new line character to Ostr.
369 OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
370 sal_Int32 len = Ostr.getLength();
371 if (char_total + len + 1 > 0xFFFF) {
372 fprintf(stderr, "Word Dictionary stc_word.dic is too big (line %ld)", sal::static_int_cast< long >(line));
373 return;
375 sal_Int32 sep=-1, eq=-1, gt=-1, lt=-1;
376 if (((sep = eq = Ostr.indexOf(sal_Unicode('='))) > 0) ||
377 ((sep = gt = Ostr.indexOf(sal_Unicode('>'))) > 0) ||
378 ((sep = lt = Ostr.indexOf(sal_Unicode('<'))) > 0)) {
380 if (eq > 0 || gt > 0) {
381 STC_WordEntry_S2T[count_S2T].address = sal::static_int_cast<sal_uInt16>( char_total );
382 STC_WordEntry_S2T[count_S2T].len = sep;
383 STC_WordEntry_S2T[count_S2T++].data = &STC_WordData[char_total];
385 if (eq > 0 || lt > 0) {
386 STC_WordEntry_T2S[count_T2S].address = sal::static_int_cast<sal_uInt16>( char_total + sep + 1 );
387 STC_WordEntry_T2S[count_T2S].len = len - sep - 1;
388 STC_WordEntry_T2S[count_T2S++].data = &STC_WordData[char_total + sep + 1];
390 for (i = 0; i < len; i++)
391 STC_WordData[char_total++] = (i == sep) ? 0 : Ostr[i];
392 STC_WordData[char_total++] = 0;
393 } else {
394 fprintf(stderr, "Invalid entry in stc_word.dic (line %ld)", sal::static_int_cast< long >(line));
395 return;
397 line++;
400 if (char_total > 0) {
401 fprintf(cfp, "\nstatic const sal_Unicode STC_WordData[] = {");
402 for (i = 0; i < char_total; i++) {
403 if (i % 32 == 0) fprintf(cfp, "\n\t");
404 fprintf(cfp, "0x%04x, ", STC_WordData[i]);
406 fprintf(cfp, "\n};\n");
408 fprintf(cfp, "\nstatic sal_Int32 STC_WordData_Count = %ld;\n", sal::static_int_cast< long >(char_total));
410 // create function to return arrays
411 fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = STC_WordData_Count; return STC_WordData; }\n");
412 } else {
413 fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = 0; return NULL; }\n");
416 sal_uInt16 STC_WordIndex[0x100];
418 if (count_S2T > 0) {
419 qsort(STC_WordEntry_S2T, count_S2T, sizeof(Index), Index_comp);
421 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_S2T[] = {");
422 count = 0;
423 length = 0;
424 for (i = 0; i < count_S2T; i++) {
425 if (i % 32 == 0) fprintf(cfp, "\n\t");
426 fprintf(cfp, "0x%04x, ", STC_WordEntry_S2T[i].address);
427 if (STC_WordEntry_S2T[i].len != length) {
428 length = STC_WordEntry_S2T[i].len;
429 while (count <= length)
430 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
433 fprintf(cfp, "\n};\n");
434 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
436 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_S2T[] = {");
437 for (i = 0; i < count; i++) {
438 if (i % 16 == 0) fprintf(cfp, "\n\t");
439 fprintf(cfp, "0x%04x, ", STC_WordIndex[i]);
441 fprintf(cfp, "\n};\n");
443 fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_S2T_Count = %ld;\n", sal::static_int_cast< long >(length));
444 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return STC_WordEntry_S2T; }\n");
445 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = STC_WordIndex_S2T_Count; return STC_WordIndex_S2T; }\n");
446 } else {
447 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return NULL; }\n");
448 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = 0; return NULL; }\n");
451 if (count_T2S > 0) {
452 qsort(STC_WordEntry_T2S, count_T2S, sizeof(Index), Index_comp);
454 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_T2S[] = {");
455 count = 0;
456 length = 0;
457 for (i = 0; i < count_T2S; i++) {
458 if (i % 32 == 0) fprintf(cfp, "\n\t");
459 fprintf(cfp, "0x%04x, ", STC_WordEntry_T2S[i].address);
460 if (STC_WordEntry_T2S[i].len != length) {
461 length = STC_WordEntry_T2S[i].len;
462 while (count <= length)
463 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
466 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
467 fprintf(cfp, "\n};\n");
469 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_T2S[] = {");
470 for (i = 0; i < count; i++) {
471 if (i % 16 == 0) fprintf(cfp, "\n\t");
472 fprintf(cfp, "0x%04x, ", STC_WordIndex[i]);
474 fprintf(cfp, "\n};\n");
476 fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_T2S_Count = %ld;\n\n", sal::static_int_cast< long >(length));
477 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return STC_WordEntry_T2S; }\n");
478 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = STC_WordIndex_T2S_Count; return STC_WordIndex_T2S; }\n");
479 } else {
480 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return NULL; }\n");
481 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = 0; return NULL; }\n");
483 free(STC_WordEntry_S2T);
484 free(STC_WordEntry_T2S);