Version 4.0.0.1, tag libreoffice-4.0.0.1
[LibreOffice.git] / i18npool / source / textconversion / genconv_dict.cxx
blob77a676fff3563915b2db52ac1dfd65a6775a73f8
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
21 #include <stdio.h>
22 #include <string.h>
23 #include <stdlib.h>
24 #include <errno.h>
25 #include <sal/main.h>
26 #include <sal/types.h>
27 #include <rtl/strbuf.hxx>
28 #include <rtl/ustring.hxx>
30 #include <vector>
32 using namespace ::rtl;
// Dictionary generators, one per supported dictionary kind.  Each one reads
// the UTF-8 dictionary source from sfp and writes generated C++ source
// (static tables plus accessor functions) to cfp.
void make_hhc_char(FILE *sfp, FILE *cfp);
void make_stc_char(FILE *sfp, FILE *cfp);
void make_stc_word(FILE *sfp, FILE *cfp);
38 /* Main Procedure */
40 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
42 FILE *sfp, *cfp;
44 if (argc < 4) exit(-1);
47 sfp = fopen(argv[2], "rb"); // open the source file for read;
48 if (sfp == NULL)
50 fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
51 exit(1);
54 // create the C source file to write
55 cfp = fopen(argv[3], "wb");
56 if (cfp == NULL) {
57 fclose(sfp);
58 fprintf(stderr, "Opening %s for writing failed: %s\n", argv[3], strerror(errno));
59 exit(1);
62 fprintf(cfp, "/*\n");
63 fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n");
64 fprintf(cfp, " * All Rights Reserved.\n");
65 fprintf(cfp, " */\n\n");
66 fprintf(cfp, "/* !!!The file is generated automatically. DONOT edit the file manually!!! */\n\n");
67 fprintf(cfp, "#include <sal/types.h>\n");
68 fprintf(cfp, "#include <textconversion.hxx>\n");
69 fprintf(cfp, "\nextern \"C\" {\n");
71 if (strcmp(argv[1], "hhc_char") == 0)
72 make_hhc_char(sfp, cfp);
73 else if (strcmp(argv[1], "stc_char") == 0)
74 make_stc_char(sfp, cfp);
75 else if (strcmp(argv[1], "stc_word") == 0)
76 make_stc_word(sfp, cfp);
78 fprintf (cfp, "}\n");
80 fclose(sfp);
81 fclose(cfp);
83 return 0;
84 } // end of main
86 // Hangul/Hanja character conversion
87 void make_hhc_char(FILE *sfp, FILE *cfp)
89 sal_Int32 count, address, i, j, k;
90 sal_Unicode Hanja2HangulData[0x10000];
91 for (i = 0; i < 0x10000; i++) {
92 Hanja2HangulData[i] = 0;
94 sal_uInt16 Hangul2HanjaData[10000][3];
96 // generate main dict. data array
97 fprintf(cfp, "\nstatic const sal_Unicode Hangul2HanjaData[] = {");
99 sal_Char Cstr[1024];
100 count = 0;
101 address = 0;
102 while (fgets(Cstr, 1024, sfp)) {
103 // input file is in UTF-8 encoding (Hangul:Hanja)
104 // don't convert last new line character to Ostr.
105 OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
106 const sal_Unicode *Ustr = Ostr.getStr();
107 sal_Int32 len = Ostr.getLength();
109 Hangul2HanjaData[count][0] = Ustr[0];
110 Hangul2HanjaData[count][1] = sal::static_int_cast<sal_uInt16>( address );
111 Hangul2HanjaData[count][2] = sal::static_int_cast<sal_uInt16>( len - 2 );
112 count++;
114 for (i = 2; i < len; i++) {
115 Hanja2HangulData[Ustr[i]] = Ustr[0];
116 if (address++ % 16 == 0)
117 fprintf(cfp, "\n\t");
118 fprintf(cfp, "0x%04x, ", Ustr[i]);
121 fprintf(cfp, "\n};\n");
123 fprintf(cfp, "\nstatic const com::sun::star::i18n::Hangul_Index Hangul2HanjaIndex[] = {\n");
124 for (i = 0; i < count; i++)
125 fprintf(cfp, "\t{ 0x%04x, 0x%04x, 0x%02x },\n",
126 Hangul2HanjaData[i][0],
127 Hangul2HanjaData[i][1],
128 Hangul2HanjaData[i][2]);
129 fprintf(cfp, "};\n");
131 fprintf(cfp, "\nstatic const sal_uInt16 Hanja2HangulIndex[] = {");
133 address=0;
134 for (i = 0; i < 0x10; i++) {
135 fprintf(cfp, "\n\t");
136 for (j = 0; j < 0x10; j++) {
137 for (k = 0; k < 0x100; k++) {
138 if (Hanja2HangulData[((i*0x10)+j)*0x100+k] != 0)
139 break;
141 fprintf(
142 cfp, "0x%04lx, ",
143 sal::static_int_cast< unsigned long >(
144 k < 0x100 ? (address++)*0x100 : 0xFFFF));
147 fprintf(cfp, "\n};\n");
149 fprintf(cfp, "\nstatic const sal_Unicode Hanja2HangulData[] = {");
151 for (i = 0; i < 0x100; i++) {
152 for (j = 0; j < 0x100; j++) {
153 if (Hanja2HangulData[i*0x100+j] != 0)
154 break;
156 if (j < 0x100) {
157 for (j = 0; j < 0x10; j++) {
158 fprintf(cfp, "\n\t");
159 for (k = 0; k < 0x10; k++) {
160 sal_Unicode c = Hanja2HangulData[((i*0x10+j)*0x10)+k];
161 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
166 fprintf(cfp, "\n};\n");
168 // create function to return arrays
169 fprintf (cfp, "\tconst sal_Unicode* getHangul2HanjaData() { return Hangul2HanjaData; }\n");
170 fprintf (cfp, "\tconst com::sun::star::i18n::Hangul_Index* getHangul2HanjaIndex() { return Hangul2HanjaIndex; }\n");
171 fprintf (cfp, "\tsal_Int16 getHangul2HanjaIndexCount() { return sizeof(Hangul2HanjaIndex) / sizeof(com::sun::star::i18n::Hangul_Index); }\n");
172 fprintf (cfp, "\tconst sal_uInt16* getHanja2HangulIndex() { return Hanja2HangulIndex; }\n");
173 fprintf (cfp, "\tconst sal_Unicode* getHanja2HangulData() { return Hanja2HangulData; }\n");
176 // Simplified/Traditional Chinese character conversion
177 void make_stc_char(FILE *sfp, FILE *cfp)
179 sal_Int32 address, i, j, k;
180 sal_Unicode SChinese2TChineseData[0x10000];
181 sal_Unicode SChinese2VChineseData[0x10000];
182 sal_Unicode TChinese2SChineseData[0x10000];
183 for (i = 0; i < 0x10000; i++) {
184 SChinese2TChineseData[i] = 0;
185 SChinese2VChineseData[i] = 0;
186 TChinese2SChineseData[i] = 0;
189 sal_Char Cstr[1024];
190 while (fgets(Cstr, 1024, sfp)) {
191 // input file is in UTF-8 encoding (SChinese:TChinese)
192 // don't convert last new line character to Ostr.
193 OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
194 const sal_Unicode *Ustr = Ostr.getStr();
195 sal_Int32 len = Ostr.getLength();
196 if (Ustr[1] == sal_Unicode('v'))
197 SChinese2VChineseData[Ustr[0]] = Ustr[2];
198 else {
199 SChinese2TChineseData[Ustr[0]] = Ustr[2];
200 if (SChinese2VChineseData[Ustr[0]] == 0)
201 SChinese2VChineseData[Ustr[0]] = Ustr[2];
203 for (i = 2; i < len; i++)
204 TChinese2SChineseData[Ustr[i]] = Ustr[0];
207 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2T[] = {");
209 address=0;
210 for (i = 0; i < 0x10; i++) {
211 fprintf(cfp, "\n\t");
212 for (j = 0; j < 0x10; j++) {
213 for (k = 0; k < 0x100; k++) {
214 if (SChinese2TChineseData[((i*0x10)+j)*0x100+k] != 0)
215 break;
217 fprintf(
218 cfp, "0x%04lx, ",
219 sal::static_int_cast< unsigned long >(
220 k < 0x100 ? (address++)*0x100 : 0xFFFF));
223 fprintf(cfp, "\n};\n");
225 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2T[] = {");
227 for (i = 0; i < 0x100; i++) {
228 for (j = 0; j < 0x100; j++) {
229 if (SChinese2TChineseData[i*0x100+j] != 0)
230 break;
232 if (j < 0x100) {
233 for (j = 0; j < 0x10; j++) {
234 fprintf(cfp, "\n\t");
235 for (k = 0; k < 0x10; k++) {
236 sal_Unicode c = SChinese2TChineseData[((i*0x10+j)*0x10)+k];
237 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
242 fprintf(cfp, "\n};\n");
244 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2V[] = {");
246 address=0;
247 for (i = 0; i < 0x10; i++) {
248 fprintf(cfp, "\n\t");
249 for (j = 0; j < 0x10; j++) {
250 for (k = 0; k < 0x100; k++) {
251 if (SChinese2VChineseData[((i*0x10)+j)*0x100+k] != 0)
252 break;
254 fprintf(
255 cfp, "0x%04lx, ",
256 sal::static_int_cast< unsigned long >(
257 k < 0x100 ? (address++)*0x100 : 0xFFFF));
260 fprintf(cfp, "\n};\n");
262 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2V[] = {");
264 for (i = 0; i < 0x100; i++) {
265 for (j = 0; j < 0x100; j++) {
266 if (SChinese2VChineseData[i*0x100+j] != 0)
267 break;
269 if (j < 0x100) {
270 for (j = 0; j < 0x10; j++) {
271 fprintf(cfp, "\n\t");
272 for (k = 0; k < 0x10; k++) {
273 sal_Unicode c = SChinese2VChineseData[((i*0x10+j)*0x10)+k];
274 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
279 fprintf(cfp, "\n};\n");
281 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_T2S[] = {");
283 address=0;
284 for (i = 0; i < 0x10; i++) {
285 fprintf(cfp, "\n\t");
286 for (j = 0; j < 0x10; j++) {
287 for (k = 0; k < 0x100; k++) {
288 if (TChinese2SChineseData[((i*0x10)+j)*0x100+k] != 0)
289 break;
291 fprintf(
292 cfp, "0x%04lx, ",
293 sal::static_int_cast< unsigned long >(
294 k < 0x100 ? (address++)*0x100 : 0xFFFF));
297 fprintf(cfp, "\n};\n");
299 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_T2S[] = {");
301 for (i = 0; i < 0x100; i++) {
302 for (j = 0; j < 0x100; j++) {
303 if (TChinese2SChineseData[i*0x100+j] != 0)
304 break;
306 if (j < 0x100) {
307 for (j = 0; j < 0x10; j++) {
308 fprintf(cfp, "\n\t");
309 for (k = 0; k < 0x10; k++) {
310 sal_Unicode c = TChinese2SChineseData[((i*0x10+j)*0x10)+k];
311 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
316 fprintf(cfp, "\n};\n");
318 // create function to return arrays
319 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2T() { return STC_CharIndex_S2T; }\n");
320 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2T() { return STC_CharData_S2T; }\n");
321 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2V() { return STC_CharIndex_S2V; }\n");
322 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2V() { return STC_CharData_S2V; }\n");
323 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_T2S() { return STC_CharIndex_T2S; }\n");
324 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_T2S() { return STC_CharData_T2S; }\n");
328 typedef struct {
329 sal_uInt16 address;
330 sal_Int32 len;
331 sal_Unicode *data;
332 } Index;
334 extern "C" {
335 int Index_comp(const void* s1, const void* s2)
337 Index *p1 = (Index*)s1, *p2 = (Index*)s2;
338 int result = p1->len - p2->len;
339 for (int i = 0; result == 0 && i < p1->len; i++)
340 result = *(p1->data+i) - *(p2->data+i);
341 return result;
345 // Simplified/Traditional Chinese word conversion
346 void make_stc_word(FILE *sfp, FILE *cfp)
348 sal_Int32 count, i, length;
349 sal_Unicode STC_WordData[0x10000];
350 std::vector<Index> STC_WordEntry_S2T(0x10000);
351 std::vector<Index> STC_WordEntry_T2S(0x10000);
352 sal_Int32 count_S2T = 0, count_T2S = 0;
353 sal_Int32 line = 0, char_total = 0;
354 sal_Char Cstr[1024];
356 while (fgets(Cstr, 1024, sfp)) {
357 // input file is in UTF-8 encoding (SChinese:TChinese)
358 // don't convert last new line character to Ostr.
359 OUString Ostr((const sal_Char *)Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
360 sal_Int32 len = Ostr.getLength();
361 if (char_total + len + 1 > 0xFFFF) {
362 fprintf(stderr, "Word Dictionary stc_word.dic is too big (line %ld)", sal::static_int_cast< long >(line));
363 return;
365 sal_Int32 sep=-1, eq=-1, gt=-1, lt=-1;
366 if (((sep = eq = Ostr.indexOf(sal_Unicode('='))) > 0) ||
367 ((sep = gt = Ostr.indexOf(sal_Unicode('>'))) > 0) ||
368 ((sep = lt = Ostr.indexOf(sal_Unicode('<'))) > 0)) {
370 if (eq > 0 || gt > 0) {
371 STC_WordEntry_S2T[count_S2T].address = sal::static_int_cast<sal_uInt16>( char_total );
372 STC_WordEntry_S2T[count_S2T].len = sep;
373 STC_WordEntry_S2T[count_S2T++].data = &STC_WordData[char_total];
375 if (eq > 0 || lt > 0) {
376 STC_WordEntry_T2S[count_T2S].address = sal::static_int_cast<sal_uInt16>( char_total + sep + 1 );
377 STC_WordEntry_T2S[count_T2S].len = len - sep - 1;
378 STC_WordEntry_T2S[count_T2S++].data = &STC_WordData[char_total + sep + 1];
380 for (i = 0; i < len; i++)
381 STC_WordData[char_total++] = (i == sep) ? 0 : Ostr[i];
382 STC_WordData[char_total++] = 0;
383 } else {
384 fprintf(stderr, "Invalid entry in stc_word.dic (line %ld)", sal::static_int_cast< long >(line));
385 return;
387 line++;
390 if (char_total > 0) {
391 fprintf(cfp, "\nstatic const sal_Unicode STC_WordData[] = {");
392 for (i = 0; i < char_total; i++) {
393 if (i % 32 == 0) fprintf(cfp, "\n\t");
394 fprintf(cfp, "0x%04x, ", STC_WordData[i]);
396 fprintf(cfp, "\n};\n");
398 fprintf(cfp, "\nstatic sal_Int32 STC_WordData_Count = %ld;\n", sal::static_int_cast< long >(char_total));
400 // create function to return arrays
401 fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = STC_WordData_Count; return STC_WordData; }\n");
402 } else {
403 fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = 0; return NULL; }\n");
406 sal_uInt16 STC_WordIndex[0x100];
408 if (count_S2T > 0) {
409 qsort(&STC_WordEntry_S2T[0], count_S2T, sizeof(Index), Index_comp);
411 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_S2T[] = {");
412 count = 0;
413 length = 0;
414 for (i = 0; i < count_S2T; i++) {
415 if (i % 32 == 0) fprintf(cfp, "\n\t");
416 fprintf(cfp, "0x%04x, ", STC_WordEntry_S2T[i].address);
417 if (STC_WordEntry_S2T[i].len != length) {
418 length = STC_WordEntry_S2T[i].len;
419 while (count <= length)
420 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
423 fprintf(cfp, "\n};\n");
424 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
426 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_S2T[] = {");
427 for (i = 0; i < count; i++) {
428 if (i % 16 == 0) fprintf(cfp, "\n\t");
429 fprintf(cfp, "0x%04x, ", STC_WordIndex[i]);
431 fprintf(cfp, "\n};\n");
433 fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_S2T_Count = %ld;\n", sal::static_int_cast< long >(length));
434 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return STC_WordEntry_S2T; }\n");
435 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = STC_WordIndex_S2T_Count; return STC_WordIndex_S2T; }\n");
436 } else {
437 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return NULL; }\n");
438 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = 0; return NULL; }\n");
441 if (count_T2S > 0) {
442 qsort(&STC_WordEntry_T2S[0], count_T2S, sizeof(Index), Index_comp);
444 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_T2S[] = {");
445 count = 0;
446 length = 0;
447 for (i = 0; i < count_T2S; i++) {
448 if (i % 32 == 0) fprintf(cfp, "\n\t");
449 fprintf(cfp, "0x%04x, ", STC_WordEntry_T2S[i].address);
450 if (STC_WordEntry_T2S[i].len != length) {
451 length = STC_WordEntry_T2S[i].len;
452 while (count <= length)
453 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
456 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
457 fprintf(cfp, "\n};\n");
459 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_T2S[] = {");
460 for (i = 0; i < count; i++) {
461 if (i % 16 == 0) fprintf(cfp, "\n\t");
462 fprintf(cfp, "0x%04x, ", STC_WordIndex[i]);
464 fprintf(cfp, "\n};\n");
466 fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_T2S_Count = %ld;\n\n", sal::static_int_cast< long >(length));
467 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return STC_WordEntry_T2S; }\n");
468 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = STC_WordIndex_T2S_Count; return STC_WordIndex_T2S; }\n");
469 } else {
470 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return NULL; }\n");
471 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = 0; return NULL; }\n");
475 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */