1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: genconv_dict.cxx,v $
10 * $Revision: 1.12.22.1 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 // MARKER(update_precomp.py): autogen include statement, do not remove
32 #include "precompiled_i18npool.hxx"
38 #include <sal/types.h>
39 #include <rtl/strbuf.hxx>
40 #include <rtl/ustring.hxx>
42 using namespace ::rtl
;
// Forward declarations of the three table generators dispatched from main().
// Each receives the opened dictionary source stream (sfp) and the stream of
// the C source file being generated (cfp).
44 void make_hhc_char(FILE *sfp
, FILE *cfp
);
45 void make_stc_char(FILE *sfp
, FILE *cfp
);
46 void make_stc_word(FILE *sfp
, FILE *cfp
);
// Program entry point of the conversion-dictionary generator.
//
// Arguments as used by the visible code:
//   argv[1]  table kind: "hhc_char", "stc_char" or "stc_word"
//   argv[2]  dictionary source file, opened for binary reading
//   argv[3]  C source file to generate, opened for binary writing
//
// NOTE(review): this listing omits some original lines (the declarations of
// sfp/cfp, whatever conditions guard the two printf error messages, and the
// end of the function) -- confirm against the complete source before editing.
50 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc
, argv
)
// Fewer than three command-line arguments: terminate with exit status -1.
54 if (argc
< 4) exit(-1);
57 sfp
= fopen(argv
[2], "rb"); // open the source file for read;
// NOTE(review): presumably only reached when the fopen above failed -- the
// guarding condition is not visible here.
60 printf("Open the dictionary source file failed.");
64 // create the C source file to write
65 cfp
= fopen(argv
[3], "wb");
// NOTE(review): presumably only reached when the fopen above failed -- the
// guarding condition is not visible here.
68 printf("Can't create the C source file.");
// Fixed preamble of the generated file: copyright banner, a "generated file"
// warning, the two includes, and the opening of an extern "C" block.
73 fprintf(cfp
, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n");
74 fprintf(cfp
, " * All Rights Reserved.\n");
75 fprintf(cfp
, " */\n\n");
76 fprintf(cfp
, "/* !!!The file is generated automatically. DONOT edit the file manually!!! */\n\n");
77 fprintf(cfp
, "#include <sal/types.h>\n");
78 fprintf(cfp
, "#include <textconversion.hxx>\n");
79 fprintf(cfp
, "\nextern \"C\" {\n");
// Dispatch on the table kind. An argv[1] that matches none of the three
// names calls no generator.
81 if (strcmp(argv
[1], "hhc_char") == 0)
82 make_hhc_char(sfp
, cfp
);
83 else if (strcmp(argv
[1], "stc_char") == 0)
84 make_stc_char(sfp
, cfp
);
85 else if (strcmp(argv
[1], "stc_word") == 0)
86 make_stc_word(sfp
, cfp
);
96 // Hangul/Hanja character conversion
// Reads UTF-8 dictionary lines from sfp and writes to cfp:
//   - Hangul2HanjaData   flat list of the Hanja characters of every line
//   - Hangul2HanjaIndex  one { Hangul, start address, count } entry per line
//   - Hanja2HangulIndex  256-entry first-level table, one per 0x100 code block
//   - Hanja2HangulData   second-level table for the reverse mapping
//   - accessor functions returning those arrays
//
// sfp: dictionary source stream; cfp: generated C source stream.
//
// NOTE(review): several original lines are missing from this listing
// (declaration of Cstr, initialisation of count/address, closing braces,
// some fprintf heads); comments below describe only what is visible.
97 void make_hhc_char(FILE *sfp
, FILE *cfp
)
99 sal_Int32 count
, address
, i
, j
, k
;
// Reverse map indexed by Hanja code unit; 0 means "no mapping".
100 sal_Unicode Hanja2HangulData
[0x10000];
101 for (i
= 0; i
< 0x10000; i
++) {
102 Hanja2HangulData
[i
] = 0;
// Per-line index rows: [0] first character of the line, [1] start address in
// the emitted data array, [2] number of characters from position 2 onward.
// NOTE(review): no visible check keeps count below the 10000-row capacity.
104 sal_uInt16 Hangul2HanjaData
[10000][3];
106 // generate main dict. data array
107 fprintf(cfp
, "\nstatic const sal_Unicode Hangul2HanjaData[] = {");
112 while (fgets(Cstr
, 1024, sfp
)) {
113 // input file is in UTF-8 encoding (Hangul:Hanja)
114 // don't convert last new line character to Ostr.
// NOTE(review): strlen(Cstr) - 1 assumes the line ends with exactly one
// newline byte; a final line without a newline would lose its last byte.
115 OUString
Ostr((const sal_Char
*)Cstr
, strlen(Cstr
) - 1, RTL_TEXTENCODING_UTF8
);
116 const sal_Unicode
*Ustr
= Ostr
.getStr();
117 sal_Int32 len
= Ostr
.getLength();
// NOTE(review): Ustr[0] is read without a visible check that len > 0.
119 Hangul2HanjaData
[count
][0] = Ustr
[0];
120 Hangul2HanjaData
[count
][1] = sal::static_int_cast
<sal_uInt16
>( address
);
121 Hangul2HanjaData
[count
][2] = sal::static_int_cast
<sal_uInt16
>( len
- 2 );
// Characters from position 2 onward are the values of this line: record the
// reverse mapping and emit each one, starting a new output row every 16 values.
124 for (i
= 2; i
< len
; i
++) {
125 Hanja2HangulData
[Ustr
[i
]] = Ustr
[0];
126 if (address
++ % 16 == 0)
127 fprintf(cfp
, "\n\t");
128 fprintf(cfp
, "0x%04x, ", Ustr
[i
]);
131 fprintf(cfp
, "\n};\n");
// Emit one index initializer per dictionary line read above.
133 fprintf(cfp
, "\nstatic const com::sun::star::i18n::Hangul_Index Hangul2HanjaIndex[] = {\n");
134 for (i
= 0; i
< count
; i
++)
135 fprintf(cfp
, "\t{ 0x%04x, 0x%04x, 0x%02x },\n",
136 Hangul2HanjaData
[i
][0],
137 Hangul2HanjaData
[i
][1],
138 Hangul2HanjaData
[i
][2]);
139 fprintf(cfp
, "};\n");
141 fprintf(cfp
, "\nstatic const sal_uInt16 Hanja2HangulIndex[] = {");
// First-level table: i and j together select one of 256 blocks of 0x100 code
// units; k scans the block for a non-zero reverse mapping.
144 for (i
= 0; i
< 0x10; i
++) {
145 fprintf(cfp
, "\n\t");
146 for (j
= 0; j
< 0x10; j
++) {
147 for (k
= 0; k
< 0x100; k
++) {
148 if (Hanja2HangulData
[((i
*0x10)+j
)*0x100+k
] != 0)
// k < 0x100 here presumably means the scan stopped early on a hit (the
// statement under the if above is not visible): such a block gets the next
// address times 0x100, any other block gets 0xFFFF.
153 sal::static_int_cast
< unsigned long >(
154 k
< 0x100 ? (address
++)*0x100 : 0xFFFF));
157 fprintf(cfp
, "\n};\n");
159 fprintf(cfp
, "\nstatic const sal_Unicode Hanja2HangulData[] = {");
// Second-level table: i walks the 256 blocks, j scans a block for any mapping;
// the j/k loops below print a block as 16 rows of 16 values, with 0xFFFF
// standing in for unmapped code units.
// NOTE(review): the condition linking the scan to the printing is not visible.
161 for (i
= 0; i
< 0x100; i
++) {
162 for (j
= 0; j
< 0x100; j
++) {
163 if (Hanja2HangulData
[i
*0x100+j
] != 0)
167 for (j
= 0; j
< 0x10; j
++) {
168 fprintf(cfp
, "\n\t");
169 for (k
= 0; k
< 0x10; k
++) {
170 sal_Unicode c
= Hanja2HangulData
[((i
*0x10+j
)*0x10)+k
];
171 fprintf(cfp
, "0x%04x, ", c
? c
: 0xFFFF);
176 fprintf(cfp
, "\n};\n");
178 // create function to return arrays
179 fprintf (cfp
, "\tconst sal_Unicode* getHangul2HanjaData() { return Hangul2HanjaData; }\n");
180 fprintf (cfp
, "\tconst com::sun::star::i18n::Hangul_Index* getHangul2HanjaIndex() { return Hangul2HanjaIndex; }\n");
181 fprintf (cfp
, "\tsal_Int16 getHangul2HanjaIndexCount() { return sizeof(Hangul2HanjaIndex) / sizeof(com::sun::star::i18n::Hangul_Index); }\n");
182 fprintf (cfp
, "\tconst sal_uInt16* getHanja2HangulIndex() { return Hanja2HangulIndex; }\n");
183 fprintf (cfp
, "\tconst sal_Unicode* getHanja2HangulData() { return Hanja2HangulData; }\n");
186 // Simplified/Traditional Chinese character conversion
// Reads UTF-8 dictionary lines from sfp into three 0x10000-entry maps
// (S2T, S2V, T2S) and writes each of them to cfp as a two-level table
// (STC_CharIndex_* and STC_CharData_*), followed by accessor functions.
//
// sfp: dictionary source stream; cfp: generated C source stream.
//
// NOTE(review): several original lines are missing from this listing
// (declaration of Cstr, resets of address, else/brace lines, some fprintf
// heads); comments below describe only what is visible.
187 void make_stc_char(FILE *sfp
, FILE *cfp
)
189 sal_Int32 address
, i
, j
, k
;
// Three maps indexed by UTF-16 code unit; 0 means "no mapping".
190 sal_Unicode SChinese2TChineseData
[0x10000];
191 sal_Unicode SChinese2VChineseData
[0x10000];
192 sal_Unicode TChinese2SChineseData
[0x10000];
193 for (i
= 0; i
< 0x10000; i
++) {
194 SChinese2TChineseData
[i
] = 0;
195 SChinese2VChineseData
[i
] = 0;
196 TChinese2SChineseData
[i
] = 0;
200 while (fgets(Cstr
, 1024, sfp
)) {
201 // input file is in UTF-8 encoding (SChinese:TChinese)
202 // don't convert last new line character to Ostr.
// NOTE(review): strlen(Cstr) - 1 assumes the line ends with exactly one
// newline byte; a final line without a newline would lose its last byte.
203 OUString
Ostr((const sal_Char
*)Cstr
, strlen(Cstr
) - 1, RTL_TEXTENCODING_UTF8
);
204 const sal_Unicode
*Ustr
= Ostr
.getStr();
205 sal_Int32 len
= Ostr
.getLength();
// Ustr[0] is the key; Ustr[1] acts as a tag, where 'v' routes the line to the
// S2V map only.
// NOTE(review): Ustr[1] and Ustr[2] are read without a visible length check.
206 if (Ustr
[1] == sal_Unicode('v'))
207 SChinese2VChineseData
[Ustr
[0]] = Ustr
[2];
// Presumably the else branch (the else line itself is not visible): record
// S2T, fill S2V only if still unset, and map every character from position 2
// onward back to the key in T2S.
209 SChinese2TChineseData
[Ustr
[0]] = Ustr
[2];
210 if (SChinese2VChineseData
[Ustr
[0]] == 0)
211 SChinese2VChineseData
[Ustr
[0]] = Ustr
[2];
213 for (i
= 2; i
< len
; i
++)
214 TChinese2SChineseData
[Ustr
[i
]] = Ustr
[0];
217 fprintf(cfp
, "\nstatic const sal_uInt16 STC_CharIndex_S2T[] = {");
// S2T first-level table: i and j together select one of 256 blocks of 0x100
// code units; k scans the block for a non-zero mapping.
220 for (i
= 0; i
< 0x10; i
++) {
221 fprintf(cfp
, "\n\t");
222 for (j
= 0; j
< 0x10; j
++) {
223 for (k
= 0; k
< 0x100; k
++) {
224 if (SChinese2TChineseData
[((i
*0x10)+j
)*0x100+k
] != 0)
// k < 0x100 here presumably means the scan stopped early on a hit (the
// statement under the if above is not visible): such a block gets the next
// address times 0x100, any other block gets 0xFFFF.
229 sal::static_int_cast
< unsigned long >(
230 k
< 0x100 ? (address
++)*0x100 : 0xFFFF));
233 fprintf(cfp
, "\n};\n");
235 fprintf(cfp
, "\nstatic const sal_Unicode STC_CharData_S2T[] = {");
// S2T second-level table: scan each block for any mapping; the j/k loops
// below print a block as 16 rows of 16 values, 0xFFFF for unmapped code units.
237 for (i
= 0; i
< 0x100; i
++) {
238 for (j
= 0; j
< 0x100; j
++) {
239 if (SChinese2TChineseData
[i
*0x100+j
] != 0)
243 for (j
= 0; j
< 0x10; j
++) {
244 fprintf(cfp
, "\n\t");
245 for (k
= 0; k
< 0x10; k
++) {
246 sal_Unicode c
= SChinese2TChineseData
[((i
*0x10+j
)*0x10)+k
];
247 fprintf(cfp
, "0x%04x, ", c
? c
: 0xFFFF);
252 fprintf(cfp
, "\n};\n");
// Same first-level / second-level emission, now for the S2V map.
254 fprintf(cfp
, "\nstatic const sal_uInt16 STC_CharIndex_S2V[] = {");
257 for (i
= 0; i
< 0x10; i
++) {
258 fprintf(cfp
, "\n\t");
259 for (j
= 0; j
< 0x10; j
++) {
260 for (k
= 0; k
< 0x100; k
++) {
261 if (SChinese2VChineseData
[((i
*0x10)+j
)*0x100+k
] != 0)
266 sal::static_int_cast
< unsigned long >(
267 k
< 0x100 ? (address
++)*0x100 : 0xFFFF));
270 fprintf(cfp
, "\n};\n");
272 fprintf(cfp
, "\nstatic const sal_Unicode STC_CharData_S2V[] = {");
274 for (i
= 0; i
< 0x100; i
++) {
275 for (j
= 0; j
< 0x100; j
++) {
276 if (SChinese2VChineseData
[i
*0x100+j
] != 0)
280 for (j
= 0; j
< 0x10; j
++) {
281 fprintf(cfp
, "\n\t");
282 for (k
= 0; k
< 0x10; k
++) {
283 sal_Unicode c
= SChinese2VChineseData
[((i
*0x10+j
)*0x10)+k
];
284 fprintf(cfp
, "0x%04x, ", c
? c
: 0xFFFF);
289 fprintf(cfp
, "\n};\n");
// Same first-level / second-level emission, now for the T2S map.
291 fprintf(cfp
, "\nstatic const sal_uInt16 STC_CharIndex_T2S[] = {");
294 for (i
= 0; i
< 0x10; i
++) {
295 fprintf(cfp
, "\n\t");
296 for (j
= 0; j
< 0x10; j
++) {
297 for (k
= 0; k
< 0x100; k
++) {
298 if (TChinese2SChineseData
[((i
*0x10)+j
)*0x100+k
] != 0)
303 sal::static_int_cast
< unsigned long >(
304 k
< 0x100 ? (address
++)*0x100 : 0xFFFF));
307 fprintf(cfp
, "\n};\n");
309 fprintf(cfp
, "\nstatic const sal_Unicode STC_CharData_T2S[] = {");
311 for (i
= 0; i
< 0x100; i
++) {
312 for (j
= 0; j
< 0x100; j
++) {
313 if (TChinese2SChineseData
[i
*0x100+j
] != 0)
317 for (j
= 0; j
< 0x10; j
++) {
318 fprintf(cfp
, "\n\t");
319 for (k
= 0; k
< 0x10; k
++) {
320 sal_Unicode c
= TChinese2SChineseData
[((i
*0x10+j
)*0x10)+k
];
321 fprintf(cfp
, "0x%04x, ", c
? c
: 0xFFFF);
326 fprintf(cfp
, "\n};\n");
328 // create function to return arrays
329 fprintf (cfp
, "\tconst sal_uInt16* getSTC_CharIndex_S2T() { return STC_CharIndex_S2T; }\n");
330 fprintf (cfp
, "\tconst sal_Unicode* getSTC_CharData_S2T() { return STC_CharData_S2T; }\n");
331 fprintf (cfp
, "\tconst sal_uInt16* getSTC_CharIndex_S2V() { return STC_CharIndex_S2V; }\n");
332 fprintf (cfp
, "\tconst sal_Unicode* getSTC_CharData_S2V() { return STC_CharData_S2V; }\n");
333 fprintf (cfp
, "\tconst sal_uInt16* getSTC_CharIndex_T2S() { return STC_CharIndex_T2S; }\n");
334 fprintf (cfp
, "\tconst sal_Unicode* getSTC_CharData_T2S() { return STC_CharData_T2S; }\n");
// Comparison callback with the signature qsort expects; make_stc_word passes
// it to qsort for both entry arrays. s1 and s2 point to Index records.
// Ordering: by len first; when the lengths are equal, by the first differing
// element of data within the first len elements.
// NOTE(review): the definition of Index and the return statement are not
// visible in this listing; presumably result is returned -- confirm.
345 int Index_comp(const void* s1
, const void* s2
)
347 Index
*p1
= (Index
*)s1
, *p2
= (Index
*)s2
;
348 int result
= p1
->len
- p2
->len
;
// The loop only runs while the records still compare equal.
349 for (int i
= 0; result
== 0 && i
< p1
->len
; i
++)
350 result
= *(p1
->data
+i
) - *(p2
->data
+i
);
355 // Simplified/Traditional Chinese word conversion
// Reads UTF-8 word entries from sfp and writes to cfp:
//   - STC_WordData       all entry text, separator replaced by 0, each entry
//                        followed by a 0
//   - STC_WordEntry_S2T  start addresses of the left-hand words, sorted with
//                        Index_comp, plus STC_WordIndex_S2T keyed by length
//   - STC_WordEntry_T2S  the same for the right-hand words
//   - accessor functions, or stubs returning NULL for tables with no content
//
// sfp: dictionary source stream; cfp: generated C source stream.
//
// NOTE(review): several original lines are missing from this listing
// (declaration of Cstr, resets of count/length, else/brace lines, the guards
// around the alternative accessor stubs); comments describe only what is
// visible.
356 void make_stc_word(FILE *sfp
, FILE *cfp
)
358 sal_Int32 count
, i
, length
;
// Backing store for all entry text; addresses are emitted as sal_uInt16.
359 sal_Unicode STC_WordData
[0x10000];
// NOTE(review): neither malloc result is visibly checked for NULL.
360 Index
*STC_WordEntry_S2T
= (Index
*) malloc(0x10000 * sizeof(Index
));
361 Index
*STC_WordEntry_T2S
= (Index
*) malloc(0x10000 * sizeof(Index
));
362 sal_Int32 count_S2T
= 0, count_T2S
= 0;
363 sal_Int32 line
= 0, char_total
= 0;
366 while (fgets(Cstr
, 1024, sfp
)) {
367 // input file is in UTF-8 encoding (SChinese:TChinese)
368 // don't convert last new line character to Ostr.
// NOTE(review): strlen(Cstr) - 1 assumes the line ends with exactly one
// newline byte; a final line without a newline would lose its last byte.
369 OUString
Ostr((const sal_Char
*)Cstr
, strlen(Cstr
) - 1, RTL_TEXTENCODING_UTF8
);
370 sal_Int32 len
= Ostr
.getLength();
// The entry plus its trailing 0 must keep char_total within 0xFFFF.
371 if (char_total
+ len
+ 1 > 0xFFFF) {
372 fprintf(stderr
, "Word Dictionary stc_word.dic is too big (line %ld)", sal::static_int_cast
< long >(line
));
// Find the separator. The || chain short-circuits, so only the first
// separator kind found at a position > 0 sets its variable to that position;
// sep ends up holding the same position.
375 sal_Int32 sep
=-1, eq
=-1, gt
=-1, lt
=-1;
376 if (((sep
= eq
= Ostr
.indexOf(sal_Unicode('='))) > 0) ||
377 ((sep
= gt
= Ostr
.indexOf(sal_Unicode('>'))) > 0) ||
378 ((sep
= lt
= Ostr
.indexOf(sal_Unicode('<'))) > 0)) {
// '=' or '>': register the left-hand word (sep characters starting at
// char_total) as an S2T entry.
380 if (eq
> 0 || gt
> 0) {
381 STC_WordEntry_S2T
[count_S2T
].address
= sal::static_int_cast
<sal_uInt16
>( char_total
);
382 STC_WordEntry_S2T
[count_S2T
].len
= sep
;
383 STC_WordEntry_S2T
[count_S2T
++].data
= &STC_WordData
[char_total
];
// '=' or '<': register the right-hand word (the characters after the
// separator) as a T2S entry.
385 if (eq
> 0 || lt
> 0) {
386 STC_WordEntry_T2S
[count_T2S
].address
= sal::static_int_cast
<sal_uInt16
>( char_total
+ sep
+ 1 );
387 STC_WordEntry_T2S
[count_T2S
].len
= len
- sep
- 1;
388 STC_WordEntry_T2S
[count_T2S
++].data
= &STC_WordData
[char_total
+ sep
+ 1];
// Copy the entry into STC_WordData with the separator replaced by 0, then
// append a 0 after it.
390 for (i
= 0; i
< len
; i
++)
391 STC_WordData
[char_total
++] = (i
== sep
) ? 0 : Ostr
[i
];
392 STC_WordData
[char_total
++] = 0;
// NOTE(review): presumably the else branch for lines without a usable
// separator -- the else line is not visible.
394 fprintf(stderr
, "Invalid entry in stc_word.dic (line %ld)", sal::static_int_cast
< long >(line
));
// Emit the collected text, 32 values per output row, and its length.
400 if (char_total
> 0) {
401 fprintf(cfp
, "\nstatic const sal_Unicode STC_WordData[] = {");
402 for (i
= 0; i
< char_total
; i
++) {
403 if (i
% 32 == 0) fprintf(cfp
, "\n\t");
404 fprintf(cfp
, "0x%04x, ", STC_WordData
[i
]);
406 fprintf(cfp
, "\n};\n");
408 fprintf(cfp
, "\nstatic sal_Int32 STC_WordData_Count = %ld;\n", sal::static_int_cast
< long >(char_total
));
410 // create function to return arrays
411 fprintf (cfp
, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = STC_WordData_Count; return STC_WordData; }\n");
// Alternative accessor reporting an empty table (presumably the else branch
// of the char_total check -- the else line is not visible).
413 fprintf (cfp
, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = 0; return NULL; }\n");
// Length-keyed index, reused for the S2T and then the T2S pass.
// NOTE(review): it has 0x100 slots and no visible check limits the word
// length used to fill it.
416 sal_uInt16 STC_WordIndex
[0x100];
// S2T pass: sort the entries with Index_comp (by length, then by content).
419 qsort(STC_WordEntry_S2T
, count_S2T
, sizeof(Index
), Index_comp
);
421 fprintf(cfp
, "\nstatic const sal_uInt16 STC_WordEntry_S2T[] = {");
// Emit the sorted addresses, 32 per output row. Whenever the entry length
// changes, fill every index slot up to the new length with the current entry
// position i.
// NOTE(review): the initial values of count and length are not visible.
424 for (i
= 0; i
< count_S2T
; i
++) {
425 if (i
% 32 == 0) fprintf(cfp
, "\n\t");
426 fprintf(cfp
, "0x%04x, ", STC_WordEntry_S2T
[i
].address
);
427 if (STC_WordEntry_S2T
[i
].len
!= length
) {
428 length
= STC_WordEntry_S2T
[i
].len
;
429 while (count
<= length
)
430 STC_WordIndex
[count
++] = sal::static_int_cast
<sal_uInt16
>(i
);
433 fprintf(cfp
, "\n};\n");
// One more slot holding the value of i after the loop.
434 STC_WordIndex
[count
++] = sal::static_int_cast
<sal_uInt16
>(i
);
436 fprintf(cfp
, "\nstatic const sal_uInt16 STC_WordIndex_S2T[] = {");
437 for (i
= 0; i
< count
; i
++) {
438 if (i
% 16 == 0) fprintf(cfp
, "\n\t");
439 fprintf(cfp
, "0x%04x, ", STC_WordIndex
[i
]);
441 fprintf(cfp
, "\n};\n");
// The emitted *_Count value is length, not count.
443 fprintf(cfp
, "\nstatic sal_Int32 STC_WordIndex_S2T_Count = %ld;\n", sal::static_int_cast
< long >(length
));
444 fprintf (cfp
, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return STC_WordEntry_S2T; }\n");
445 fprintf (cfp
, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = STC_WordIndex_S2T_Count; return STC_WordIndex_S2T; }\n");
// Alternative S2T accessors reporting empty tables (the guard selecting them
// is not visible).
447 fprintf (cfp
, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return NULL; }\n");
448 fprintf (cfp
, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = 0; return NULL; }\n");
// T2S pass: same procedure as for S2T.
452 qsort(STC_WordEntry_T2S
, count_T2S
, sizeof(Index
), Index_comp
);
454 fprintf(cfp
, "\nstatic const sal_uInt16 STC_WordEntry_T2S[] = {");
457 for (i
= 0; i
< count_T2S
; i
++) {
458 if (i
% 32 == 0) fprintf(cfp
, "\n\t");
459 fprintf(cfp
, "0x%04x, ", STC_WordEntry_T2S
[i
].address
);
460 if (STC_WordEntry_T2S
[i
].len
!= length
) {
461 length
= STC_WordEntry_T2S
[i
].len
;
462 while (count
<= length
)
463 STC_WordIndex
[count
++] = sal::static_int_cast
<sal_uInt16
>(i
);
466 STC_WordIndex
[count
++] = sal::static_int_cast
<sal_uInt16
>(i
);
467 fprintf(cfp
, "\n};\n");
469 fprintf(cfp
, "\nstatic const sal_uInt16 STC_WordIndex_T2S[] = {");
470 for (i
= 0; i
< count
; i
++) {
471 if (i
% 16 == 0) fprintf(cfp
, "\n\t");
472 fprintf(cfp
, "0x%04x, ", STC_WordIndex
[i
]);
474 fprintf(cfp
, "\n};\n");
476 fprintf(cfp
, "\nstatic sal_Int32 STC_WordIndex_T2S_Count = %ld;\n\n", sal::static_int_cast
< long >(length
));
477 fprintf (cfp
, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return STC_WordEntry_T2S; }\n");
478 fprintf (cfp
, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = STC_WordIndex_T2S_Count; return STC_WordIndex_T2S; }\n");
// Alternative T2S accessors reporting empty tables (the guard selecting them
// is not visible).
480 fprintf (cfp
, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return NULL; }\n");
481 fprintf (cfp
, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = 0; return NULL; }\n");
// Release the two entry arrays allocated at the top of the function.
483 free(STC_WordEntry_S2T
);
484 free(STC_WordEntry_T2S
);