Bump version to 6.0-36
[LibreOffice.git] / i18npool / source / breakiterator / gendict.cxx
blob677ea49812afab90d899a60e78aa66adc4ca6c66
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <stdio.h>
21 #include <string.h>
22 #include <stdlib.h>
23 #include <errno.h>
24 #include <sal/main.h>
25 #include <sal/types.h>
26 #include <rtl/strbuf.hxx>
27 #include <rtl/ustring.hxx>
28 #include <osl/diagnose.h>
29 #include <vector>
31 using std::vector;
34 // For iOS, where we must strive for a minimal executable size, we
35 // keep the data produced by this utility not as large const tables in
36 // source code but instead as separate data files, to be bundled with
37 // an app, and mmapped in at run time.
39 // To test this easier on a desktop OS, just make sure
40 // DICT_JA_ZH_IN_DATAFILE is defined when building i18npool.
#ifdef DICT_JA_ZH_IN_DATAFILE
// Positions (as reported by ftell() on the output file) at which each table
// begins. Every print*() routine records its own start position, and main()
// appends all five values to the end of the data file.
static sal_Int64 dataAreaOffset = 0;
static sal_Int64 lenArrayOffset = 0;
static sal_Int64 index1Offset = 0;
static sal_Int64 index2Offset = 0;
static sal_Int64 existMarkOffset = 0;
#endif
/* Utility gendict:

   "BreakIterator_CJK provides input string caching and dictionary searching for
   longest matching. You can provide a sorted dictionary (the encoding must be
   UTF-8) by creating the following file:
            i18npool/source/breakiterator/data/<language>.dict.

   The utility gendict will convert the file to C code, which will be compiled
   into a shared library for dynamic loading.

   All dictionary searching and loading is performed in the xdictionary class.
   The only thing you need to do is to derive your class from BreakIterator_CJK
   and create an instance of the xdictionary with the language name and
   pass it to the parent class."
   (from http://wiki.openoffice.org/wiki/Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale
   - 27/01/2011)
*/
67 // C-standard guarantees that static variables are automatically initialized to 0
68 static sal_uInt8 exists[0x2000];
69 static sal_uInt32 charArray[0x10000];
71 static inline void set_exists(sal_uInt32 index)
73 exists[index>>3] |= 1 << (index & 0x07);
// Writes the prologue of the generated C source: the "generated file" banner
// and the include of <sal/types.h>. When DICT_JA_ZH_IN_DATAFILE is defined
// the output is a data file, so nothing is written.
static inline void printIncludes(FILE* source_fp)
{
#ifdef DICT_JA_ZH_IN_DATAFILE
    (void) source_fp;
#else
    fputs("/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n", source_fp);
    fputs("#include <sal/types.h>\n\n", source_fp);
#endif
}
// Writes the exported accessor functions of the generated C source.
//
// Two variants are emitted, selected in the generated code by
// DISABLE_DYNLOADING: plain names (getExistMark(), getIndex1(), ...) when
// that macro is not defined, and names suffixed with "_<lang>" otherwise.
//
// source_fp  output stream for the generated source
// lang       language code used as the suffix of the second variant
//
// When DICT_JA_ZH_IN_DATAFILE is defined nothing is written.
static inline void printFunctions(FILE* source_fp, const char *lang)
{
#ifdef DICT_JA_ZH_IN_DATAFILE
    (void) source_fp;
    (void) lang;
#else
    // Variant with unsuffixed accessor names.
    fputs ("#ifndef DISABLE_DYNLOADING\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n", source_fp);

    // Variant with per-language accessor names.
    fputs ("#else\n", source_fp);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n", lang);
    fputs ("#endif\n", source_fp);
#endif
}
108 static inline void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray)
110 // generate main dict. data array
111 #ifndef DICT_JA_ZH_IN_DATAFILE
112 fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp);
113 #else
114 dataAreaOffset = ftell(source_fp);
115 #endif
116 sal_Char str[1024];
117 sal_uInt32 lenArrayCurr = 0;
118 sal_Unicode current = 0;
120 while (fgets(str, 1024, dictionary_fp)) {
121 // input file is in UTF-8 encoding
122 // don't convert last new line character to Ostr.
123 OUString Ostr(str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
125 const sal_Int32 len = Ostr.getLength();
127 sal_Int32 i=0;
128 Ostr.iterateCodePoints(&i);
129 if (len == i)
130 continue; // skip one character word
132 if (Ostr[0] != current) {
133 OSL_ENSURE( (Ostr[0] > current), "Dictionary file should be sorted");
134 current = Ostr[0];
135 charArray[current] = lenArray.size();
138 lenArray.push_back(lenArrayCurr);
140 set_exists(Ostr[0]);
141 // first character is stored in charArray, so start from second
142 for (i = 1; i < len; i++, lenArrayCurr++) {
143 set_exists(Ostr[i]);
144 #ifndef DICT_JA_ZH_IN_DATAFILE
145 fprintf(source_fp, "0x%04x, ", Ostr[i]);
146 if ((lenArrayCurr & 0x0f) == 0x0f)
147 fputs("\n\t", source_fp);
148 #else
149 sal_Unicode x = Ostr[i];
150 fwrite(&x, sizeof(Ostr[i]), 1, source_fp);
151 #endif
154 charArray[current+1] = lenArray.size();
155 lenArray.push_back( lenArrayCurr ); // store last ending pointer
156 #ifndef DICT_JA_ZH_IN_DATAFILE
157 fputs("\n};\n", source_fp);
158 #endif
161 static inline void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray)
163 #ifndef DICT_JA_ZH_IN_DATAFILE
164 fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
165 fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
166 #else
167 lenArrayOffset = ftell(source_fp);
168 sal_uInt32 zero(0);
169 fwrite(&zero, sizeof(zero), 1, source_fp);
170 #endif
171 for (size_t k = 0; k < lenArray.size(); k++)
173 if( !(k & 0xf) )
174 fputs("\n\t", source_fp);
176 #ifndef DICT_JA_ZH_IN_DATAFILE
177 fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k]));
178 #else
179 fwrite(&lenArray[k], sizeof(lenArray[k]), 1, source_fp);
180 #endif
183 #ifndef DICT_JA_ZH_IN_DATAFILE
184 fputs("\n};\n", source_fp );
185 #endif
188 /* FIXME?: what happens if in every range i there is at least one charArray != 0
189 => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
190 => then in index2, the last range will be ignored incorrectly */
191 static inline void printIndex1(FILE *source_fp, sal_Int16 *set)
193 #ifndef DICT_JA_ZH_IN_DATAFILE
194 fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
195 #else
196 index1Offset = ftell(source_fp);
197 #endif
199 sal_Int16 count = 0;
200 for (sal_Int32 i = 0; i < 0x100; i++) {
201 sal_Int32 j = 0;
202 while( j < 0x100 && charArray[(i<<8) + j] == 0)
203 j++;
205 set[i] = (j < 0x100 ? count++ : 0xff);
206 #ifndef DICT_JA_ZH_IN_DATAFILE
207 fprintf(source_fp, "0x%02x, ", set[i]);
208 if ((i & 0x0f) == 0x0f)
209 fputs ("\n\t", source_fp);
210 #else
211 fwrite(&set[i], sizeof(set[i]), 1, source_fp);
212 #endif
215 #ifndef DICT_JA_ZH_IN_DATAFILE
216 fputs("};\n", source_fp);
217 #endif
220 static inline void printIndex2(FILE *source_fp, sal_Int16 const *set)
222 #ifndef DICT_JA_ZH_IN_DATAFILE
223 fputs ("static const sal_Int32 index2[] = {\n\t", source_fp);
224 #else
225 index2Offset = ftell(source_fp);
226 #endif
227 sal_Int32 prev = 0;
228 for (sal_Int32 i = 0; i < 0x100; i++) {
229 if (set[i] != 0xff) {
230 for (sal_Int32 j = 0; j < 0x100; j++) {
231 sal_Int32 k = (i<<8) + j;
232 if (prev != 0 )
233 while( k < 0x10000 && charArray[k] == 0 )
234 k++;
236 prev = charArray[(i<<8) + j];
237 #ifndef DICT_JA_ZH_IN_DATAFILE
238 fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0));
239 if ((j & 0x0f) == 0x0f)
240 fputs ("\n\t", source_fp);
241 #else
242 sal_uInt32 n = (k < 0x10000 ? charArray[k] + 1 : 0);
243 fwrite(&n, sizeof(n), 1, source_fp);
244 #endif
246 #ifndef DICT_JA_ZH_IN_DATAFILE
247 fputs ("\n\t", source_fp);
248 #endif
251 #ifndef DICT_JA_ZH_IN_DATAFILE
252 fputs ("\n};\n", source_fp);
253 #endif
256 /* Generates a bitmask for the existence of sal_Unicode values in dictionary;
257 it packs 8 sal_Bool values in 1 sal_uInt8 */
258 static inline void printExistsMask(FILE *source_fp)
260 #ifndef DICT_JA_ZH_IN_DATAFILE
261 fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
262 #else
263 existMarkOffset = ftell(source_fp);
264 #endif
265 for (unsigned int i = 0; i < 0x2000; i++)
267 #ifndef DICT_JA_ZH_IN_DATAFILE
268 fprintf(source_fp, "0x%02x, ", exists[i]);
269 if ( (i & 0xf) == 0xf )
270 fputs("\n\t", source_fp);
271 #else
272 fwrite(&exists[i], sizeof(exists[i]), 1, source_fp);
273 #endif
276 #ifndef DICT_JA_ZH_IN_DATAFILE
277 fputs("\n};\n", source_fp);
278 #endif
281 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
283 FILE *dictionary_fp, *source_fp;
285 if (argc == 1 || argc > 4)
287 fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr);
288 exit(-1);
291 dictionary_fp = fopen(argv[1], "rb"); // open the source file for read;
292 if (dictionary_fp == nullptr)
294 fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
295 exit(1);
298 if(argc == 2)
299 source_fp = stdout;
300 else
302 // create the C source file to write
303 source_fp = fopen(argv[2], "wb");
304 if (source_fp == nullptr) {
305 fclose(dictionary_fp);
306 fprintf(stderr, "Opening %s for writing failed: %s\n", argv[2], strerror(errno));
307 exit(1);
311 vector<sal_uInt32> lenArray; // stores the word boundaries in DataArea
312 sal_Int16 set[0x100];
314 printIncludes(source_fp);
315 #ifndef DICT_JA_ZH_IN_DATAFILE
316 fputs("extern \"C\" {\n", source_fp);
317 #endif
318 printDataArea(dictionary_fp, source_fp, lenArray);
319 printLenArray(source_fp, lenArray);
320 printIndex1(source_fp, set);
321 printIndex2(source_fp, set);
322 printExistsMask(source_fp);
323 printFunctions(source_fp, argv[3]);
324 #ifndef DICT_JA_ZH_IN_DATAFILE
325 fputs("}\n", source_fp);
326 #else
327 // Put pointers to the tables at the end of the file...
328 fwrite(&dataAreaOffset, sizeof(dataAreaOffset), 1, source_fp);
329 fwrite(&lenArrayOffset, sizeof(lenArrayOffset), 1, source_fp);
330 fwrite(&index1Offset, sizeof(index1Offset), 1, source_fp);
331 fwrite(&index2Offset, sizeof(index2Offset), 1, source_fp);
332 fwrite(&existMarkOffset, sizeof(existMarkOffset), 1, source_fp);
333 #endif
335 fclose(dictionary_fp);
336 fclose(source_fp);
338 return 0;
341 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */