1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
25 #include <sal/types.h>
26 #include <rtl/ustring.hxx>
27 #include <osl/diagnose.h>
33 // For iOS, where we must strive for a minimal executable size, we
34 // keep the data produced by this utility not as large const tables in
35 // source code but instead as separate data files, to be bundled with
36 // an app, and mmapped in at run time.
38 // To make this easier to test on a desktop OS, just make sure
39 // DICT_JA_ZH_IN_DATAFILE is defined when building i18npool.
// Positions of the five tables inside the generated output file. Each one is
// captured with ftell() right before the corresponding table is written, and
// main() appends all five values to the end of the output with fwrite().
// They are declared only when DICT_JA_ZH_IN_DATAFILE is defined.
// NOTE(review): the ftell() results are stored unchecked -- confirm whether a
// failing ftell() needs handling.
41 #ifdef DICT_JA_ZH_IN_DATAFILE
42 static sal_Int64 dataAreaOffset
= 0;
43 static sal_Int64 lenArrayOffset
= 0;
44 static sal_Int64 index1Offset
= 0;
45 static sal_Int64 index2Offset
= 0;
46 static sal_Int64 existMarkOffset
= 0;
51 "BreakIterator_CJK provides input string caching and dictionary searching for
52 longest matching. You can provide a sorted dictionary (the encoding must be
53 UTF-8) by creating the following file:
54 i18npool/source/breakiterator/data/<language>.dict.
56 The utility gendict will convert the file to C code, which will be compiled
57 into a shared library for dynamic loading.
59 All dictionary searching and loading is performed in the xdictionary class.
60 The only thing you need to do is to derive your class from BreakIterator_CJK
61 and create an instance of the xdictionary with the language name and
62 pass it to the parent class." (from http://wiki.openoffice.org/wiki/
63 /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
66 // C-standard guarantees that static variables are automatically initialized to 0
// exists:    bitmask filled by set_exists(), one bit per index; 0x2000 bytes
//            give room for 0x10000 bits. Dumped by printExistsMask().
// charArray: 0x10000 entries; printDataArea() stores the current lenArray size
//            under the first code unit of a word. printIndex1() and
//            printIndex2() treat a value of 0 as "nothing recorded".
67 static sal_uInt8 exists
[0x2000];
68 static sal_uInt32 charArray
[0x10000];
// Sets the bit for `index` in the `exists` bitmask: the byte is selected with
// index >> 3 and the bit inside that byte with index & 0x07.
// No range check is performed here; `exists` has 0x2000 bytes, so only
// indices below 0x10000 stay inside the array.
70 static void set_exists(sal_uInt32 index
)
72 exists
[index
>>3] |= 1 << (index
& 0x07);
// Writes the header of the generated source file to source_fp: a
// "generated automatically, do not edit" banner followed by the
// #include <sal/types.h> line.
// Both fputs() calls sit behind #ifndef DICT_JA_ZH_IN_DATAFILE.
75 static void printIncludes(FILE* source_fp
)
77 #ifndef DICT_JA_ZH_IN_DATAFILE
78 fputs("/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n", source_fp
);
79 fputs("#include <sal/types.h>\n\n", source_fp
);
// Writes the exported accessor functions for the five generated tables
// (existMark, index1, index2, lenArray, dataArea) to source_fp.
//   source_fp  stream receiving the generated source text
//   lang       language code; inserted with %s as the "_<lang>" suffix of the
//              accessor names in the second set below
// The generated text itself contains a preprocessor switch: under
// "#ifndef DISABLE_DYNLOADING" the accessors have plain names (getExistMark,
// getIndex1, ...); in the generated "#else" branch every name carries the
// language suffix.
// The statements below sit behind #ifndef DICT_JA_ZH_IN_DATAFILE.
// NOTE(review): lang is formatted with %s without a null check; main() passes
// argv[3] -- confirm that it is always present.
85 static void printFunctions(FILE* source_fp
, const char *lang
)
87 #ifndef DICT_JA_ZH_IN_DATAFILE
88 fputs ("#ifndef DISABLE_DYNLOADING\n", source_fp
);
89 fputs ("SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n", source_fp
);
90 fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n", source_fp
);
91 fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n", source_fp
);
92 fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n", source_fp
);
93 fputs ("SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n", source_fp
);
94 fputs ("#else\n", source_fp
);
// Variant with the language suffix in every accessor name.
95 fprintf (source_fp
, "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n", lang
);
96 fprintf (source_fp
, "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n", lang
);
97 fprintf (source_fp
, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n", lang
);
98 fprintf (source_fp
, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n", lang
);
99 fprintf (source_fp
, "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n", lang
);
100 fputs ("#endif\n", source_fp
);
// Reads the dictionary from dictionary_fp line by line and emits the dataArea
// table to source_fp.
//   dictionary_fp  input stream; every line is decoded as UTF-8 into an OUString
//   source_fp      output stream for the generated table
//   lenArray       receives, for every stored word, the value of lenArrayCurr
//                  at the point where the word starts, plus one final entry
//                  holding the end position
// For each word the code units from index 1 onward are written, either as
// "0x%04x, " text with a row break after every 16th value, or as raw
// sal_Unicode values through fwrite(). The fwrite()/ftell() statements go with
// dataAreaOffset, which is declared only under DICT_JA_ZH_IN_DATAFILE.
// NOTE(review): strlen(str) - 1 drops the last byte unconditionally, so a final
// line without a trailing newline would lose its last byte, and lines longer
// than the 1024-byte fgets() limit are split into several "words" -- confirm
// the guarantees given by the input files.
107 static void printDataArea(FILE *dictionary_fp
, FILE *source_fp
, vector
<sal_uInt32
>& lenArray
)
109 // generate main dict. data array
110 #ifndef DICT_JA_ZH_IN_DATAFILE
111 fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp
);
113 dataAreaOffset
= ftell(source_fp
);
// lenArrayCurr counts the code units emitted so far; it is the running
// position inside dataArea.
116 sal_uInt32 lenArrayCurr
= 0;
117 sal_Unicode current
= 0;
119 while (fgets(str
, 1024, dictionary_fp
)) {
120 // input file is in UTF-8 encoding
121 // don't convert last new line character to Ostr.
122 OUString
Ostr(str
, strlen(str
) - 1, RTL_TEXTENCODING_UTF8
);
124 const sal_Int32 len
= Ostr
.getLength();
127 Ostr
.iterateCodePoints(&i
);
129 continue; // skip one character word
// A first code unit different from `current` starts a new group; the
// dictionary is expected to be sorted, which OSL_ENSURE checks.
131 if (Ostr
[0] != current
) {
132 OSL_ENSURE( (Ostr
[0] > current
), "Dictionary file should be sorted");
134 charArray
[current
] = lenArray
.size();
// Record where this word starts inside dataArea.
137 lenArray
.push_back(lenArrayCurr
);
140 // first character is stored in charArray, so start from second
141 for (i
= 1; i
< len
; i
++, lenArrayCurr
++) {
143 #ifndef DICT_JA_ZH_IN_DATAFILE
144 fprintf(source_fp
, "0x%04x, ", Ostr
[i
]);
145 if ((lenArrayCurr
& 0x0f) == 0x0f)
146 fputs("\n\t", source_fp
);
148 sal_Unicode x
= Ostr
[i
];
149 fwrite(&x
, sizeof(Ostr
[i
]), 1, source_fp
);
// After the last word: store the lenArray size one slot past `current`.
153 charArray
[current
+1] = lenArray
.size();
154 lenArray
.push_back( lenArrayCurr
); // store last ending pointer
155 #ifndef DICT_JA_ZH_IN_DATAFILE
156 fputs("\n};\n", source_fp
);
// Writes the lenArray table to source_fp.
//   source_fp  output stream for the generated table
//   lenArray   word start positions collected by printDataArea()
// A zero entry is emitted in front of the real data, either as the text "0x0, "
// or as a binary zero through fwrite(), so every real entry sits one position
// later than its index in `lenArray`. printIndex2() accounts for this by
// emitting charArray[k] + 1.
// The real entries follow, formatted with SAL_PRIxUINT32 as text or written
// as raw values with fwrite(). The ftell()/fwrite() statements go with
// lenArrayOffset, which is declared only under DICT_JA_ZH_IN_DATAFILE.
160 static void printLenArray(FILE* source_fp
, const vector
<sal_uInt32
>& lenArray
)
162 #ifndef DICT_JA_ZH_IN_DATAFILE
163 fprintf(source_fp
, "static const sal_Int32 lenArray[] = {\n\t");
164 fprintf(source_fp
, "0x%x, ", 0); // insert one slot for skipping 0 in index2 array.
166 lenArrayOffset
= ftell(source_fp
);
168 fwrite(&zero
, sizeof(zero
), 1, source_fp
);
170 for (size_t k
= 0; k
< lenArray
.size(); k
++)
173 fputs("\n\t", source_fp
);
175 #ifndef DICT_JA_ZH_IN_DATAFILE
176 fprintf(source_fp
, "0x%" SAL_PRIxUINT32
", ", lenArray
[k
]);
178 fwrite(&lenArray
[k
], sizeof(lenArray
[k
]), 1, source_fp
);
182 #ifndef DICT_JA_ZH_IN_DATAFILE
183 fputs("\n};\n", source_fp
);
// Writes the index1 table (0x100 entries, one per high byte i) to source_fp
// and fills `set` with the same values for later use by printIndex2().
//   source_fp  output stream for the generated table
//   set        output array of 0x100 entries
// For every i the 0x100 charArray entries starting at (i << 8) are examined
// for a non-zero value. If j ends below 0x100, set[i] receives the next value
// of `count`; otherwise it receives the marker 0xff.
// Each entry is emitted as "0x%02x, " text with a row break after every 16th
// value, or as a raw sal_Int16 through fwrite(). The ftell()/fwrite()
// statements go with index1Offset, which is declared only under
// DICT_JA_ZH_IN_DATAFILE.
187 /* FIXME?: what happens if in every range i there is at least one charArray != 0
188 => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
189 => then in index2, the last range will be ignored incorrectly */
190 static void printIndex1(FILE *source_fp
, sal_Int16
*set
)
192 #ifndef DICT_JA_ZH_IN_DATAFILE
193 fprintf (source_fp
, "static const sal_Int16 index1[] = {\n\t");
195 index1Offset
= ftell(source_fp
);
199 for (sal_Int32 i
= 0; i
< 0x100; i
++) {
// Loop while the entries of block i are zero and j is still inside the block.
201 while( j
< 0x100 && charArray
[(i
<<8) + j
] == 0)
// j < 0x100 means a non-zero entry was found in this block.
204 set
[i
] = (j
< 0x100 ? count
++ : 0xff);
205 #ifndef DICT_JA_ZH_IN_DATAFILE
206 fprintf(source_fp
, "0x%02x, ", set
[i
]);
207 if ((i
& 0x0f) == 0x0f)
208 fputs ("\n\t", source_fp
);
210 fwrite(&set
[i
], sizeof(set
[i
]), 1, source_fp
);
214 #ifndef DICT_JA_ZH_IN_DATAFILE
215 fputs("};\n", source_fp
);
// Writes the index2 table to source_fp.
//   source_fp  output stream for the generated table
//   set        the 0x100 entries filled by printIndex1(); blocks whose entry
//              equals 0xff are skipped entirely
// For every remaining block i, 0x100 values are emitted. k starts at
// (i << 8) + j and the while loop tests for zero charArray entries up to
// 0x10000. The emitted value is charArray[k] + 1 when k is still below
// 0x10000, and 0 otherwise. The "+ 1" matches the extra leading entry that
// printLenArray() puts in front of the lenArray data.
// Values are written as "0x%lx, " text with a row break after every 16th
// value, or as raw sal_uInt32 values through fwrite(). The ftell()/fwrite()
// statements go with index2Offset, which is declared only under
// DICT_JA_ZH_IN_DATAFILE.
219 static void printIndex2(FILE *source_fp
, sal_Int16
const *set
)
221 #ifndef DICT_JA_ZH_IN_DATAFILE
222 fputs ("static const sal_Int32 index2[] = {\n\t", source_fp
);
224 index2Offset
= ftell(source_fp
);
227 for (sal_Int32 i
= 0; i
< 0x100; i
++) {
228 if (set
[i
] != 0xff) {
229 for (sal_Int32 j
= 0; j
< 0x100; j
++) {
230 sal_Int32 k
= (i
<<8) + j
;
232 while( k
< 0x10000 && charArray
[k
] == 0 )
// prev keeps the unmodified entry at (i << 8) + j, independent of k.
235 prev
= charArray
[(i
<<8) + j
];
236 #ifndef DICT_JA_ZH_IN_DATAFILE
237 fprintf(source_fp
, "0x%lx, ", static_cast<long unsigned int>(k
< 0x10000 ? charArray
[k
] + 1 : 0));
238 if ((j
& 0x0f) == 0x0f)
239 fputs ("\n\t", source_fp
);
241 sal_uInt32 n
= (k
< 0x10000 ? charArray
[k
] + 1 : 0);
242 fwrite(&n
, sizeof(n
), 1, source_fp
);
245 #ifndef DICT_JA_ZH_IN_DATAFILE
246 fputs ("\n\t", source_fp
);
250 #ifndef DICT_JA_ZH_IN_DATAFILE
251 fputs ("\n};\n", source_fp
);
// Writes all 0x2000 bytes of the `exists` bitmask to source_fp as the
// existMark table.
//   source_fp  output stream for the generated table
// Each byte is emitted as "0x%02x, " text with a row break after every 16th
// value, or as a raw byte through fwrite(). The ftell()/fwrite() statements go
// with existMarkOffset, which is declared only under DICT_JA_ZH_IN_DATAFILE.
255 /* Generates a bitmask for the existence of sal_Unicode values in dictionary;
256 it packs 8 sal_Bool values in 1 sal_uInt8 */
257 static void printExistsMask(FILE *source_fp
)
259 #ifndef DICT_JA_ZH_IN_DATAFILE
260 fprintf (source_fp
, "static const sal_uInt8 existMark[] = {\n\t");
262 existMarkOffset
= ftell(source_fp
);
264 for (unsigned int i
= 0; i
< 0x2000; i
++)
266 #ifndef DICT_JA_ZH_IN_DATAFILE
267 fprintf(source_fp
, "0x%02x, ", exists
[i
]);
268 if ( (i
& 0xf) == 0xf )
269 fputs("\n\t", source_fp
);
271 fwrite(&exists
[i
], sizeof(exists
[i
]), 1, source_fp
);
275 #ifndef DICT_JA_ZH_IN_DATAFILE
276 fputs("\n};\n", source_fp
);
// Program entry point: converts a dictionary file into the generated tables.
//   argv[1]  dictionary file, opened with mode "rb"
//   argv[2]  output file, opened with mode "wb"
//   argv[3]  language code, handed to printFunctions() as the name suffix
// The tables are produced in the order dataArea, lenArray, index1, index2,
// existMark, followed by the accessor functions. The five table offsets are
// written last with fwrite(); those variables are declared only under
// DICT_JA_ZH_IN_DATAFILE.
// NOTE(review): the argument check rejects only argc == 1 and argc > 4, while
// the message asks for exactly 3 arguments and argv[3] is used below --
// confirm that argc == 2 and argc == 3 are handled safely.
// NOTE(review): in the output-open failure path fclose() runs before
// strerror(errno) is evaluated, so the reported error may not be the one set
// by fopen().
// NOTE(review): the usage message has no trailing newline.
280 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc
, argv
)
282 FILE *dictionary_fp
, *source_fp
;
284 if (argc
== 1 || argc
> 4)
286 fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr
);
290 dictionary_fp
= fopen(argv
[1], "rb"); // open the dictionary (input) file for reading
291 if (dictionary_fp
== nullptr)
293 fprintf(stderr
, "Opening the dictionary source file %s for reading failed: %s\n", argv
[1], strerror(errno
));
301 // create the C source file to write
302 source_fp
= fopen(argv
[2], "wb");
303 if (source_fp
== nullptr) {
304 fclose(dictionary_fp
);
305 fprintf(stderr
, "Opening %s for writing failed: %s\n", argv
[2], strerror(errno
));
310 vector
<sal_uInt32
> lenArray
; // stores the word boundaries in DataArea
// Filled by printIndex1() and read by printIndex2().
311 sal_Int16 set
[0x100];
313 printIncludes(source_fp
);
314 #ifndef DICT_JA_ZH_IN_DATAFILE
315 fputs("extern \"C\" {\n", source_fp
);
317 printDataArea(dictionary_fp
, source_fp
, lenArray
);
318 printLenArray(source_fp
, lenArray
);
319 printIndex1(source_fp
, set
);
320 printIndex2(source_fp
, set
);
321 printExistsMask(source_fp
);
322 printFunctions(source_fp
, argv
[3]);
323 #ifndef DICT_JA_ZH_IN_DATAFILE
324 fputs("}\n", source_fp
);
326 // Put pointers to the tables at the end of the file...
327 fwrite(&dataAreaOffset
, sizeof(dataAreaOffset
), 1, source_fp
);
328 fwrite(&lenArrayOffset
, sizeof(lenArrayOffset
), 1, source_fp
);
329 fwrite(&index1Offset
, sizeof(index1Offset
), 1, source_fp
);
330 fwrite(&index2Offset
, sizeof(index2Offset
), 1, source_fp
);
331 fwrite(&existMarkOffset
, sizeof(existMarkOffset
), 1, source_fp
);
334 fclose(dictionary_fp
);
340 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */