Version 6.4.0.0.beta1, tag libreoffice-6.4.0.0.beta1
[LibreOffice.git] / i18npool / source / textconversion / genconv_dict.cxx
blob4173d898622dc0bd49a40da4944e9cccba6ef928
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <stdio.h>
22 #include <string.h>
23 #include <stdlib.h>
24 #include <errno.h>
25 #include <sal/main.h>
26 #include <sal/types.h>
27 #include <rtl/ustring.hxx>
29 #include <vector>
31 static void make_hhc_char(FILE *sfp, FILE *cfp);
32 static void make_stc_char(FILE *sfp, FILE *cfp);
33 static void make_stc_word(FILE *sfp, FILE *cfp);
35 /* Main Procedure */
37 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
39 FILE *sfp, *cfp;
41 if (argc < 4) exit(-1);
44 sfp = fopen(argv[2], "rb"); // open the source file for read;
45 if (sfp == nullptr)
47 fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
48 exit(1);
51 // create the C source file to write
52 cfp = fopen(argv[3], "wb");
53 if (cfp == nullptr) {
54 fclose(sfp);
55 fprintf(stderr, "Opening %s for writing failed: %s\n", argv[3], strerror(errno));
56 exit(1);
59 fprintf(cfp, "/*\n");
60 fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n");
61 fprintf(cfp, " * All Rights Reserved.\n");
62 fprintf(cfp, " */\n\n");
63 fprintf(cfp, "/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n");
64 fprintf(cfp, "#include <sal/types.h>\n");
65 fprintf(cfp, "#include <textconversion.hxx>\n");
66 fprintf(cfp, "\nextern \"C\" {\n");
68 if (strcmp(argv[1], "hhc_char") == 0)
69 make_hhc_char(sfp, cfp);
70 else if (strcmp(argv[1], "stc_char") == 0)
71 make_stc_char(sfp, cfp);
72 else if (strcmp(argv[1], "stc_word") == 0)
73 make_stc_word(sfp, cfp);
75 fprintf (cfp, "}\n");
77 fclose(sfp);
78 fclose(cfp);
80 return 0;
81 } // end of main
83 // Hangul/Hanja character conversion
84 void make_hhc_char(FILE *sfp, FILE *cfp)
86 sal_Int32 count, address, i, j, k;
87 sal_Unicode Hanja2HangulData[0x10000];
88 for (i = 0; i < 0x10000; i++) {
89 Hanja2HangulData[i] = 0;
91 sal_uInt16 Hangul2HanjaData[10000][3];
93 // generate main dict. data array
94 fprintf(cfp, "\nstatic const sal_Unicode Hangul2HanjaData[] = {");
96 sal_Char Cstr[1024];
97 count = 0;
98 address = 0;
99 while (fgets(Cstr, 1024, sfp)) {
100 // input file is in UTF-8 encoding (Hangul:Hanja)
101 // don't convert last new line character to Ostr.
102 OUString Ostr(Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
103 sal_Int32 len = Ostr.getLength();
105 Hangul2HanjaData[count][0] = Ostr[0];
106 Hangul2HanjaData[count][1] = sal::static_int_cast<sal_uInt16>( address );
107 Hangul2HanjaData[count][2] = sal::static_int_cast<sal_uInt16>( len - 2 );
108 count++;
110 for (i = 2; i < len; i++) {
111 Hanja2HangulData[Ostr[i]] = Ostr[0];
112 if (address++ % 16 == 0)
113 fprintf(cfp, "\n\t");
114 fprintf(cfp, "0x%04x, ", Ostr[i]);
117 fprintf(cfp, "\n};\n");
119 fprintf(cfp, "\nstatic const i18npool::Hangul_Index Hangul2HanjaIndex[] = {\n");
120 for (i = 0; i < count; i++)
121 fprintf(cfp, "\t{ 0x%04x, 0x%04x, 0x%02x },\n",
122 Hangul2HanjaData[i][0],
123 Hangul2HanjaData[i][1],
124 Hangul2HanjaData[i][2]);
125 fprintf(cfp, "};\n");
127 fprintf(cfp, "\nstatic const sal_uInt16 Hanja2HangulIndex[] = {");
129 address=0;
130 for (i = 0; i < 0x10; i++) {
131 fprintf(cfp, "\n\t");
132 for (j = 0; j < 0x10; j++) {
133 for (k = 0; k < 0x100; k++) {
134 if (Hanja2HangulData[((i*0x10)+j)*0x100+k] != 0)
135 break;
137 fprintf(
138 cfp, "0x%04lx, ",
139 sal::static_int_cast< unsigned long >(
140 k < 0x100 ? (address++)*0x100 : 0xFFFF));
143 fprintf(cfp, "\n};\n");
145 fprintf(cfp, "\nstatic const sal_Unicode Hanja2HangulData[] = {");
147 for (i = 0; i < 0x100; i++) {
148 for (j = 0; j < 0x100; j++) {
149 if (Hanja2HangulData[i*0x100+j] != 0)
150 break;
152 if (j < 0x100) {
153 for (j = 0; j < 0x10; j++) {
154 fprintf(cfp, "\n\t");
155 for (k = 0; k < 0x10; k++) {
156 sal_Unicode c = Hanja2HangulData[((i*0x10+j)*0x10)+k];
157 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
162 fprintf(cfp, "\n};\n");
164 // create function to return arrays
165 fprintf (cfp, "\tconst sal_Unicode* getHangul2HanjaData() { return Hangul2HanjaData; }\n");
166 fprintf (cfp, "\tconst i18npool::Hangul_Index* getHangul2HanjaIndex() { return Hangul2HanjaIndex; }\n");
167 fprintf (cfp, "\tsal_Int16 getHangul2HanjaIndexCount() { return sizeof(Hangul2HanjaIndex) / sizeof(i18npool::Hangul_Index); }\n");
168 fprintf (cfp, "\tconst sal_uInt16* getHanja2HangulIndex() { return Hanja2HangulIndex; }\n");
169 fprintf (cfp, "\tconst sal_Unicode* getHanja2HangulData() { return Hanja2HangulData; }\n");
172 // Simplified/Traditional Chinese character conversion
173 void make_stc_char(FILE *sfp, FILE *cfp)
175 sal_Int32 address, i, j, k;
176 sal_Unicode SChinese2TChineseData[0x10000];
177 sal_Unicode SChinese2VChineseData[0x10000];
178 sal_Unicode TChinese2SChineseData[0x10000];
179 for (i = 0; i < 0x10000; i++) {
180 SChinese2TChineseData[i] = 0;
181 SChinese2VChineseData[i] = 0;
182 TChinese2SChineseData[i] = 0;
185 sal_Char Cstr[1024];
186 while (fgets(Cstr, 1024, sfp)) {
187 // input file is in UTF-8 encoding (SChinese:TChinese)
188 // don't convert last new line character to Ostr.
189 OUString Ostr(Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
190 sal_Int32 len = Ostr.getLength();
191 if (Ostr[1] == 'v')
192 SChinese2VChineseData[Ostr[0]] = Ostr[2];
193 else {
194 SChinese2TChineseData[Ostr[0]] = Ostr[2];
195 if (SChinese2VChineseData[Ostr[0]] == 0)
196 SChinese2VChineseData[Ostr[0]] = Ostr[2];
198 for (i = 2; i < len; i++)
199 TChinese2SChineseData[Ostr[i]] = Ostr[0];
202 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2T[] = {");
204 address=0;
205 for (i = 0; i < 0x10; i++) {
206 fprintf(cfp, "\n\t");
207 for (j = 0; j < 0x10; j++) {
208 for (k = 0; k < 0x100; k++) {
209 if (SChinese2TChineseData[((i*0x10)+j)*0x100+k] != 0)
210 break;
212 fprintf(
213 cfp, "0x%04lx, ",
214 sal::static_int_cast< unsigned long >(
215 k < 0x100 ? (address++)*0x100 : 0xFFFF));
218 fprintf(cfp, "\n};\n");
220 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2T[] = {");
222 for (i = 0; i < 0x100; i++) {
223 for (j = 0; j < 0x100; j++) {
224 if (SChinese2TChineseData[i*0x100+j] != 0)
225 break;
227 if (j < 0x100) {
228 for (j = 0; j < 0x10; j++) {
229 fprintf(cfp, "\n\t");
230 for (k = 0; k < 0x10; k++) {
231 sal_Unicode c = SChinese2TChineseData[((i*0x10+j)*0x10)+k];
232 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
237 fprintf(cfp, "\n};\n");
239 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2V[] = {");
241 address=0;
242 for (i = 0; i < 0x10; i++) {
243 fprintf(cfp, "\n\t");
244 for (j = 0; j < 0x10; j++) {
245 for (k = 0; k < 0x100; k++) {
246 if (SChinese2VChineseData[((i*0x10)+j)*0x100+k] != 0)
247 break;
249 fprintf(
250 cfp, "0x%04lx, ",
251 sal::static_int_cast< unsigned long >(
252 k < 0x100 ? (address++)*0x100 : 0xFFFF));
255 fprintf(cfp, "\n};\n");
257 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2V[] = {");
259 for (i = 0; i < 0x100; i++) {
260 for (j = 0; j < 0x100; j++) {
261 if (SChinese2VChineseData[i*0x100+j] != 0)
262 break;
264 if (j < 0x100) {
265 for (j = 0; j < 0x10; j++) {
266 fprintf(cfp, "\n\t");
267 for (k = 0; k < 0x10; k++) {
268 sal_Unicode c = SChinese2VChineseData[((i*0x10+j)*0x10)+k];
269 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
274 fprintf(cfp, "\n};\n");
276 fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_T2S[] = {");
278 address=0;
279 for (i = 0; i < 0x10; i++) {
280 fprintf(cfp, "\n\t");
281 for (j = 0; j < 0x10; j++) {
282 for (k = 0; k < 0x100; k++) {
283 if (TChinese2SChineseData[((i*0x10)+j)*0x100+k] != 0)
284 break;
286 fprintf(
287 cfp, "0x%04lx, ",
288 sal::static_int_cast< unsigned long >(
289 k < 0x100 ? (address++)*0x100 : 0xFFFF));
292 fprintf(cfp, "\n};\n");
294 fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_T2S[] = {");
296 for (i = 0; i < 0x100; i++) {
297 for (j = 0; j < 0x100; j++) {
298 if (TChinese2SChineseData[i*0x100+j] != 0)
299 break;
301 if (j < 0x100) {
302 for (j = 0; j < 0x10; j++) {
303 fprintf(cfp, "\n\t");
304 for (k = 0; k < 0x10; k++) {
305 sal_Unicode c = TChinese2SChineseData[((i*0x10+j)*0x10)+k];
306 fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
311 fprintf(cfp, "\n};\n");
313 // create function to return arrays
314 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2T() { return STC_CharIndex_S2T; }\n");
315 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2T() { return STC_CharData_S2T; }\n");
316 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2V() { return STC_CharIndex_S2V; }\n");
317 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2V() { return STC_CharData_S2V; }\n");
318 fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_T2S() { return STC_CharIndex_T2S; }\n");
319 fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_T2S() { return STC_CharData_T2S; }\n");
323 struct Index {
324 sal_uInt16 address;
325 sal_Int32 len;
326 sal_Unicode *data;
329 extern "C" {
330 static int Index_comp(const void* s1, const void* s2)
332 Index const *p1 = static_cast<Index const *>(s1), *p2 = static_cast<Index const *>(s2);
333 int result = p1->len - p2->len;
334 for (int i = 0; result == 0 && i < p1->len; i++)
335 result = *(p1->data+i) - *(p2->data+i);
336 return result;
340 // Simplified/Traditional Chinese word conversion
341 void make_stc_word(FILE *sfp, FILE *cfp)
343 sal_Int32 count, i, length;
344 sal_Unicode STC_WordData[0x10000];
345 std::vector<Index> STC_WordEntry_S2T(0x10000);
346 std::vector<Index> STC_WordEntry_T2S(0x10000);
347 sal_Int32 count_S2T = 0, count_T2S = 0;
348 sal_Int32 line = 0, char_total = 0;
349 sal_Char Cstr[1024];
351 while (fgets(Cstr, 1024, sfp)) {
352 // input file is in UTF-8 encoding (SChinese:TChinese)
353 // don't convert last new line character to Ostr.
354 OUString Ostr(Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
355 sal_Int32 len = Ostr.getLength();
356 if (char_total + len + 1 > 0xFFFF) {
357 fprintf(stderr, "Word Dictionary stc_word.dic is too big (line %ld)", sal::static_int_cast< long >(line));
358 return;
360 sal_Int32 sep=-1, eq=-1, gt=-1, lt=-1;
361 if (((sep = eq = Ostr.indexOf('=')) > 0) ||
362 ((sep = gt = Ostr.indexOf('>')) > 0) ||
363 ((sep = lt = Ostr.indexOf('<')) > 0)) {
365 if (eq > 0 || gt > 0) {
366 STC_WordEntry_S2T[count_S2T].address = sal::static_int_cast<sal_uInt16>( char_total );
367 STC_WordEntry_S2T[count_S2T].len = sep;
368 STC_WordEntry_S2T[count_S2T++].data = &STC_WordData[char_total];
370 if (eq > 0 || lt > 0) {
371 STC_WordEntry_T2S[count_T2S].address = sal::static_int_cast<sal_uInt16>( char_total + sep + 1 );
372 STC_WordEntry_T2S[count_T2S].len = len - sep - 1;
373 STC_WordEntry_T2S[count_T2S++].data = &STC_WordData[char_total + sep + 1];
375 for (i = 0; i < len; i++)
376 STC_WordData[char_total++] = (i == sep) ? 0 : Ostr[i];
377 STC_WordData[char_total++] = 0;
378 } else {
379 fprintf(stderr, "Invalid entry in stc_word.dic (line %ld)", sal::static_int_cast< long >(line));
380 return;
382 line++;
385 if (char_total > 0) {
386 fprintf(cfp, "\nstatic const sal_Unicode STC_WordData[] = {");
387 for (i = 0; i < char_total; i++) {
388 if (i % 32 == 0) fprintf(cfp, "\n\t");
389 fprintf(cfp, "0x%04x, ", STC_WordData[i]);
391 fprintf(cfp, "\n};\n");
393 fprintf(cfp, "\nstatic sal_Int32 STC_WordData_Count = %ld;\n", sal::static_int_cast< long >(char_total));
395 // create function to return arrays
396 fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = STC_WordData_Count; return STC_WordData; }\n");
397 } else {
398 fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = 0; return NULL; }\n");
401 sal_uInt16 STC_WordIndex[0x100];
403 if (count_S2T > 0) {
404 qsort(STC_WordEntry_S2T.data(), count_S2T, sizeof(Index), Index_comp);
406 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_S2T[] = {");
407 count = 0;
408 length = 0;
409 for (i = 0; i < count_S2T; i++) {
410 if (i % 32 == 0) fprintf(cfp, "\n\t");
411 fprintf(cfp, "0x%04x, ", STC_WordEntry_S2T[i].address);
412 if (STC_WordEntry_S2T[i].len != length) {
413 length = STC_WordEntry_S2T[i].len;
414 while (count <= length)
415 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
418 fprintf(cfp, "\n};\n");
419 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
421 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_S2T[] = {");
422 for (i = 0; i < count; i++) {
423 if (i % 16 == 0) fprintf(cfp, "\n\t");
424 fprintf(cfp, "0x%04x, ", STC_WordIndex[i]);
426 fprintf(cfp, "\n};\n");
428 fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_S2T_Count = %ld;\n", sal::static_int_cast< long >(length));
429 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return STC_WordEntry_S2T; }\n");
430 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = STC_WordIndex_S2T_Count; return STC_WordIndex_S2T; }\n");
431 } else {
432 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return NULL; }\n");
433 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = 0; return NULL; }\n");
436 if (count_T2S > 0) {
437 qsort(STC_WordEntry_T2S.data(), count_T2S, sizeof(Index), Index_comp);
439 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_T2S[] = {");
440 count = 0;
441 length = 0;
442 for (i = 0; i < count_T2S; i++) {
443 if (i % 32 == 0) fprintf(cfp, "\n\t");
444 fprintf(cfp, "0x%04x, ", STC_WordEntry_T2S[i].address);
445 if (STC_WordEntry_T2S[i].len != length) {
446 length = STC_WordEntry_T2S[i].len;
447 while (count <= length)
448 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
451 STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
452 fprintf(cfp, "\n};\n");
454 fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_T2S[] = {");
455 for (i = 0; i < count; i++) {
456 if (i % 16 == 0) fprintf(cfp, "\n\t");
457 fprintf(cfp, "0x%04x, ", STC_WordIndex[i]);
459 fprintf(cfp, "\n};\n");
461 fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_T2S_Count = %ld;\n\n", sal::static_int_cast< long >(length));
462 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return STC_WordEntry_T2S; }\n");
463 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = STC_WordIndex_T2S_Count; return STC_WordIndex_T2S; }\n");
464 } else {
465 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return NULL; }\n");
466 fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = 0; return NULL; }\n");
470 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */