Version 6.4.0.0.beta1, tag libreoffice-6.4.0.0.beta1
[LibreOffice.git] / i18npool / source / breakiterator / gendict.cxx
blob082ad2416bad19a7fdf400254570563ef1ffb68b
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <stdio.h>
21 #include <string.h>
22 #include <stdlib.h>
23 #include <errno.h>
24 #include <sal/main.h>
25 #include <sal/types.h>
26 #include <rtl/ustring.hxx>
27 #include <osl/diagnose.h>
28 #include <vector>
30 using std::vector;
// For iOS, where we must strive for a minimal executable size, we
// keep the data produced by this utility not as large const tables in
// source code but instead as separate data files, to be bundled with
// an app, and mmapped in at run time.

// To test this more easily on a desktop OS, just make sure
// DICT_JA_ZH_IN_DATAFILE is defined when building i18npool.
#ifdef DICT_JA_ZH_IN_DATAFILE
// Positions of the individual tables inside the generated data file.
// Each one is taken from ftell() by the matching print*() function right
// before that function writes its table; main() appends all five values
// to the end of the data file.
static sal_Int64 dataAreaOffset = 0;
static sal_Int64 lenArrayOffset = 0;
static sal_Int64 index1Offset = 0;
static sal_Int64 index2Offset = 0;
static sal_Int64 existMarkOffset = 0;
#endif
/* Utility gendict:

   "BreakIterator_CJK provides input string caching and dictionary searching for
   longest matching. You can provide a sorted dictionary (the encoding must be
   UTF-8) by creating the following file:
            i18npool/source/breakiterator/data/<language>.dict.

   The utility gendict will convert the file to C code, which will be compiled
   into a shared library for dynamic loading.

   All dictionary searching and loading is performed in the xdictionary class.
   The only thing you need to do is to derive your class from BreakIterator_CJK
   and create an instance of the xdictionary with the language name and
   pass it to the parent class." (from
   http://wiki.openoffice.org/wiki/Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale
   - 27/01/2011)
*/
// Objects with static storage duration are guaranteed to be zero-initialized,
// so both tables start out as all zeros without an explicit initializer.

// Bitmask filled by set_exists(): bit (c & 7) of byte (c >> 3) is set for
// every UTF-16 code unit c passed to it (0x2000 bytes * 8 bits = 0x10000 bits).
static sal_uInt8 exists[0x2000];
// Indexed by UTF-16 code unit; printDataArea() stores positions within
// lenArray here, and printIndex1()/printIndex2() read them back.
static sal_uInt32 charArray[0x10000];
70 static void set_exists(sal_uInt32 index)
72 exists[index>>3] |= 1 << (index & 0x07);
// Writes the "generated file" banner and the #include line at the top of the
// generated C source. When the tables go into a binary data file
// (DICT_JA_ZH_IN_DATAFILE) nothing is written.
static void printIncludes(FILE* source_fp)
{
#ifdef DICT_JA_ZH_IN_DATAFILE
    (void) source_fp;   // unused in data-file mode
#else
    fputs("/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n", source_fp);
    fputs("#include <sal/types.h>\n\n", source_fp);
#endif
}
// Writes the exported accessor functions that hand out the generated tables.
// Two variants are emitted into the generated source, selected there by
// DISABLE_DYNLOADING: plain names (getExistMark, ...) when it is not defined,
// and names suffixed with "_<lang>" when it is.
// In data-file mode (DICT_JA_ZH_IN_DATAFILE) nothing is written.
static void printFunctions(FILE* source_fp, const char *lang)
{
#ifdef DICT_JA_ZH_IN_DATAFILE
    (void) source_fp;
    (void) lang;
#else
    // Variant without a language suffix.
    fputs ("#ifndef DISABLE_DYNLOADING\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n", source_fp);

    // Variant whose function names carry the language code as a suffix.
    fputs ("#else\n", source_fp);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n", lang);
    fputs ("#endif\n", source_fp);
#endif
}
107 static void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray)
109 // generate main dict. data array
110 #ifndef DICT_JA_ZH_IN_DATAFILE
111 fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp);
112 #else
113 dataAreaOffset = ftell(source_fp);
114 #endif
115 sal_Char str[1024];
116 sal_uInt32 lenArrayCurr = 0;
117 sal_Unicode current = 0;
119 while (fgets(str, 1024, dictionary_fp)) {
120 // input file is in UTF-8 encoding
121 // don't convert last new line character to Ostr.
122 OUString Ostr(str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
124 const sal_Int32 len = Ostr.getLength();
126 sal_Int32 i=0;
127 Ostr.iterateCodePoints(&i);
128 if (len == i)
129 continue; // skip one character word
131 if (Ostr[0] != current) {
132 OSL_ENSURE( (Ostr[0] > current), "Dictionary file should be sorted");
133 current = Ostr[0];
134 charArray[current] = lenArray.size();
137 lenArray.push_back(lenArrayCurr);
139 set_exists(Ostr[0]);
140 // first character is stored in charArray, so start from second
141 for (i = 1; i < len; i++, lenArrayCurr++) {
142 set_exists(Ostr[i]);
143 #ifndef DICT_JA_ZH_IN_DATAFILE
144 fprintf(source_fp, "0x%04x, ", Ostr[i]);
145 if ((lenArrayCurr & 0x0f) == 0x0f)
146 fputs("\n\t", source_fp);
147 #else
148 sal_Unicode x = Ostr[i];
149 fwrite(&x, sizeof(Ostr[i]), 1, source_fp);
150 #endif
153 charArray[current+1] = lenArray.size();
154 lenArray.push_back( lenArrayCurr ); // store last ending pointer
155 #ifndef DICT_JA_ZH_IN_DATAFILE
156 fputs("\n};\n", source_fp);
157 #endif
160 static void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray)
162 #ifndef DICT_JA_ZH_IN_DATAFILE
163 fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
164 fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
165 #else
166 lenArrayOffset = ftell(source_fp);
167 sal_uInt32 zero(0);
168 fwrite(&zero, sizeof(zero), 1, source_fp);
169 #endif
170 for (size_t k = 0; k < lenArray.size(); k++)
172 if( !(k & 0xf) )
173 fputs("\n\t", source_fp);
175 #ifndef DICT_JA_ZH_IN_DATAFILE
176 fprintf(source_fp, "0x%" SAL_PRIxUINT32 ", ", lenArray[k]);
177 #else
178 fwrite(&lenArray[k], sizeof(lenArray[k]), 1, source_fp);
179 #endif
182 #ifndef DICT_JA_ZH_IN_DATAFILE
183 fputs("\n};\n", source_fp );
184 #endif
187 /* FIXME?: what happens if in every range i there is at least one charArray != 0
188 => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
189 => then in index2, the last range will be ignored incorrectly */
190 static void printIndex1(FILE *source_fp, sal_Int16 *set)
192 #ifndef DICT_JA_ZH_IN_DATAFILE
193 fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
194 #else
195 index1Offset = ftell(source_fp);
196 #endif
198 sal_Int16 count = 0;
199 for (sal_Int32 i = 0; i < 0x100; i++) {
200 sal_Int32 j = 0;
201 while( j < 0x100 && charArray[(i<<8) + j] == 0)
202 j++;
204 set[i] = (j < 0x100 ? count++ : 0xff);
205 #ifndef DICT_JA_ZH_IN_DATAFILE
206 fprintf(source_fp, "0x%02x, ", set[i]);
207 if ((i & 0x0f) == 0x0f)
208 fputs ("\n\t", source_fp);
209 #else
210 fwrite(&set[i], sizeof(set[i]), 1, source_fp);
211 #endif
214 #ifndef DICT_JA_ZH_IN_DATAFILE
215 fputs("};\n", source_fp);
216 #endif
219 static void printIndex2(FILE *source_fp, sal_Int16 const *set)
221 #ifndef DICT_JA_ZH_IN_DATAFILE
222 fputs ("static const sal_Int32 index2[] = {\n\t", source_fp);
223 #else
224 index2Offset = ftell(source_fp);
225 #endif
226 sal_Int32 prev = 0;
227 for (sal_Int32 i = 0; i < 0x100; i++) {
228 if (set[i] != 0xff) {
229 for (sal_Int32 j = 0; j < 0x100; j++) {
230 sal_Int32 k = (i<<8) + j;
231 if (prev != 0 )
232 while( k < 0x10000 && charArray[k] == 0 )
233 k++;
235 prev = charArray[(i<<8) + j];
236 #ifndef DICT_JA_ZH_IN_DATAFILE
237 fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0));
238 if ((j & 0x0f) == 0x0f)
239 fputs ("\n\t", source_fp);
240 #else
241 sal_uInt32 n = (k < 0x10000 ? charArray[k] + 1 : 0);
242 fwrite(&n, sizeof(n), 1, source_fp);
243 #endif
245 #ifndef DICT_JA_ZH_IN_DATAFILE
246 fputs ("\n\t", source_fp);
247 #endif
250 #ifndef DICT_JA_ZH_IN_DATAFILE
251 fputs ("\n};\n", source_fp);
252 #endif
255 /* Generates a bitmask for the existence of sal_Unicode values in dictionary;
256 it packs 8 sal_Bool values in 1 sal_uInt8 */
257 static void printExistsMask(FILE *source_fp)
259 #ifndef DICT_JA_ZH_IN_DATAFILE
260 fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
261 #else
262 existMarkOffset = ftell(source_fp);
263 #endif
264 for (unsigned int i = 0; i < 0x2000; i++)
266 #ifndef DICT_JA_ZH_IN_DATAFILE
267 fprintf(source_fp, "0x%02x, ", exists[i]);
268 if ( (i & 0xf) == 0xf )
269 fputs("\n\t", source_fp);
270 #else
271 fwrite(&exists[i], sizeof(exists[i]), 1, source_fp);
272 #endif
275 #ifndef DICT_JA_ZH_IN_DATAFILE
276 fputs("\n};\n", source_fp);
277 #endif
280 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
282 FILE *dictionary_fp, *source_fp;
284 if (argc == 1 || argc > 4)
286 fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr);
287 exit(-1);
290 dictionary_fp = fopen(argv[1], "rb"); // open the source file for read;
291 if (dictionary_fp == nullptr)
293 fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
294 exit(1);
297 if(argc == 2)
298 source_fp = stdout;
299 else
301 // create the C source file to write
302 source_fp = fopen(argv[2], "wb");
303 if (source_fp == nullptr) {
304 fclose(dictionary_fp);
305 fprintf(stderr, "Opening %s for writing failed: %s\n", argv[2], strerror(errno));
306 exit(1);
310 vector<sal_uInt32> lenArray; // stores the word boundaries in DataArea
311 sal_Int16 set[0x100];
313 printIncludes(source_fp);
314 #ifndef DICT_JA_ZH_IN_DATAFILE
315 fputs("extern \"C\" {\n", source_fp);
316 #endif
317 printDataArea(dictionary_fp, source_fp, lenArray);
318 printLenArray(source_fp, lenArray);
319 printIndex1(source_fp, set);
320 printIndex2(source_fp, set);
321 printExistsMask(source_fp);
322 printFunctions(source_fp, argv[3]);
323 #ifndef DICT_JA_ZH_IN_DATAFILE
324 fputs("}\n", source_fp);
325 #else
326 // Put pointers to the tables at the end of the file...
327 fwrite(&dataAreaOffset, sizeof(dataAreaOffset), 1, source_fp);
328 fwrite(&lenArrayOffset, sizeof(lenArrayOffset), 1, source_fp);
329 fwrite(&index1Offset, sizeof(index1Offset), 1, source_fp);
330 fwrite(&index2Offset, sizeof(index2Offset), 1, source_fp);
331 fwrite(&existMarkOffset, sizeof(existMarkOffset), 1, source_fp);
332 #endif
334 fclose(dictionary_fp);
335 fclose(source_fp);
337 return 0;
340 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */