1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
25 #include <sal/types.h>
26 #include <rtl/strbuf.hxx>
27 #include <rtl/ustring.hxx>
28 #include <osl/diagnose.h>
// For iOS, where we must strive for a minimal executable size, we
// keep the data produced by this utility not as large const tables in
// source code but instead as separate data files, to be bundled with
// an app, and mmapped in at run time.
//
// To test this more easily on a desktop OS, just make sure
// DICT_JA_ZH_IN_DATAFILE is defined when building i18npool.
#ifdef DICT_JA_ZH_IN_DATAFILE
// Positions in the output file, as reported by ftell(), at which each table
// starts. They are only needed in data-file mode: each print* function records
// its own offset before writing, and main() appends all five values to the
// end of the output file.
static sal_Int64 dataAreaOffset = 0;
static sal_Int64 lenArrayOffset = 0;
static sal_Int64 index1Offset = 0;
static sal_Int64 index2Offset = 0;
static sal_Int64 existMarkOffset = 0;
#endif
/*
   "BreakIterator_CJK provides input string caching and dictionary searching for
   longest matching. You can provide a sorted dictionary (the encoding must be
   UTF-8) by creating the following file:
            i18npool/source/breakiterator/data/<language>.dict.

   The utility gendict will convert the file to C code, which will be compiled
   into a shared library for dynamic loading.

   All dictionary searching and loading is performed in the xdictionary class.
   The only thing you need to do is to derive your class from BreakIterator_CJK
   and create an instance of the xdictionary with the language name and
   pass it to the parent class." (from
   http://wiki.openoffice.org/wiki/Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale
   - 27/01/2011)
*/
67 // C-standard guarantees that static variables are automatically initialized to 0
68 static sal_uInt8 exists
[0x2000];
69 static sal_uInt32 charArray
[0x10000];
71 static inline void set_exists(sal_uInt32 index
)
73 exists
[index
>>3] |= 1 << (index
& 0x07);
// Writes the preamble of the generated C source file: the "generated
// automatically" banner and the #include of <sal/types.h>.
// When DICT_JA_ZH_IN_DATAFILE is defined the output is a raw data file, so
// nothing is written.
static inline void printIncludes(FILE* source_fp)
{
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n", source_fp);
    fputs("#include <sal/types.h>\n\n", source_fp);
#else
    (void) source_fp; // unused in data-file mode
#endif
}
// Writes the exported accessor functions for the five generated tables
// (existMark, index1, index2, lenArray, dataArea) to the generated C source.
//
// Two variants are emitted, selected in the generated file by
// DISABLE_DYNLOADING: plain names (getExistMark, ...) when it is not defined,
// and names suffixed with "_<lang>" (getExistMark_<lang>, ...) when it is.
//
// source_fp: output stream of the generated source file.
// lang:      language code used as the function-name suffix; must be a valid
//            C string, since it is passed to fprintf() with "%s".
//
// When DICT_JA_ZH_IN_DATAFILE is defined the output is a raw data file, so
// nothing is written.
static inline void printFunctions(FILE* source_fp, const char *lang)
{
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs ("#ifndef DISABLE_DYNLOADING\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n", source_fp);
    fputs ("#else\n", source_fp);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n", lang);
    fputs ("#endif\n", source_fp);
#else
    (void) source_fp; // unused in data-file mode
    (void) lang;      // unused in data-file mode
#endif
}
// printDataArea: reads the dictionary one fgets() line at a time, converts each
// line from UTF-8 to an OUString, and writes every UTF-16 code unit of the word
// except the first one to source_fp -- either as "0x%04x, " text inside a
// `dataArea[]` initializer, or as raw sal_Unicode values via fwrite().
// Side effects: appends to lenArray the running code-unit count before each
// word plus one final end value, and stores lenArray positions in charArray.
//
// NOTE(review): this listing appears to have lost lines (the embedded numbering
// jumps, e.g. 114 -> 117 and 125 -> 128). The function braces, the declarations
// of `str` and `i`, the condition guarding `continue`, any assignment to
// `current`, and the #else/#endif directives are not visible here -- confirm
// against the complete file before changing anything.
108 static inline void printDataArea(FILE *dictionary_fp
, FILE *source_fp
, vector
<sal_uInt32
>& lenArray
)
110 // generate main dict. data array
111 #ifndef DICT_JA_ZH_IN_DATAFILE
112 fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp
);
// dataAreaOffset is declared only under DICT_JA_ZH_IN_DATAFILE, so this
// statement presumably sits in an #else branch that is not visible here.
114 dataAreaOffset
= ftell(source_fp
);
// lenArrayCurr counts the code units written to the data area so far.
117 sal_uInt32 lenArrayCurr
= 0;
// `current` is compared against the first code unit of each word below.
118 sal_Unicode current
= 0;
// NOTE(review): fgets() with a size of 1024 returns longer input lines in
// several pieces; each piece would then be handled as a separate word.
120 while (fgets(str
, 1024, dictionary_fp
)) {
121 // input file is in UTF-8 encoding
122 // don't convert last new line character to Ostr.
// NOTE(review): strlen(str) - 1 assumes the line ends in '\n'; a final line
// without a trailing newline would lose its last byte -- verify the input.
123 OUString
Ostr(str
, strlen(str
) - 1, RTL_TEXTENCODING_UTF8
);
125 const sal_Int32 len
= Ostr
.getLength();
// iterateCodePoints() advances i past the first code point of the word.
128 Ostr
.iterateCodePoints(&i
);
130 continue; // skip one character word
// A first code unit different from `current` starts a new group of words.
132 if (Ostr
[0] != current
) {
133 OSL_ENSURE( (Ostr
[0] > current
), "Dictionary file should be sorted");
135 charArray
[current
] = lenArray
.size();
// Record where this word's data starts in the data area.
138 lenArray
.push_back(lenArrayCurr
);
141 // first character is stored in charArray, so start from second
142 for (i
= 1; i
< len
; i
++, lenArrayCurr
++) {
144 #ifndef DICT_JA_ZH_IN_DATAFILE
145 fprintf(source_fp
, "0x%04x, ", Ostr
[i
]);
// Start a new output line after every 16 values.
146 if ((lenArrayCurr
& 0x0f) == 0x0f)
147 fputs("\n\t", source_fp
);
// Raw output: one sal_Unicode per code unit.
149 sal_Unicode x
= Ostr
[i
];
150 fwrite(&x
, sizeof(Ostr
[i
]), 1, source_fp
);
// Entry just past the last first-code-unit seen, followed by the end offset.
154 charArray
[current
+1] = lenArray
.size();
155 lenArray
.push_back( lenArrayCurr
); // store last ending pointer
156 #ifndef DICT_JA_ZH_IN_DATAFILE
157 fputs("\n};\n", source_fp
);
// printLenArray: writes the contents of lenArray to source_fp, preceded by one
// extra leading zero element. The output is either "0x%lx, " text inside a
// `lenArray[]` initializer, or raw values via fwrite().
//
// NOTE(review): this listing appears to have lost lines (the embedded numbering
// jumps, e.g. 165 -> 167 -> 169 and 171 -> 174). The function braces, the
// declaration of `zero`, the loop body's braces and any condition in front of
// the fputs("\n\t") call, and the #else/#endif directives are not visible
// here -- confirm against the complete file.
161 static inline void printLenArray(FILE* source_fp
, const vector
<sal_uInt32
>& lenArray
)
163 #ifndef DICT_JA_ZH_IN_DATAFILE
164 fprintf(source_fp
, "static const sal_Int32 lenArray[] = {\n\t");
165 fprintf(source_fp
, "0x%x, ", 0); // insert one slot for skipping 0 in index2 array.
// lenArrayOffset is declared only under DICT_JA_ZH_IN_DATAFILE, so this
// statement presumably sits in an #else branch that is not visible here.
167 lenArrayOffset
= ftell(source_fp
);
// Raw-output counterpart of the leading zero element written as text above.
169 fwrite(&zero
, sizeof(zero
), 1, source_fp
);
171 for (size_t k
= 0; k
< lenArray
.size(); k
++)
174 fputs("\n\t", source_fp
);
176 #ifndef DICT_JA_ZH_IN_DATAFILE
// The cast matches the "%lx" conversion regardless of how wide sal_uInt32 is.
177 fprintf(source_fp
, "0x%lx, ", static_cast<long unsigned int>(lenArray
[k
]));
179 fwrite(&lenArray
[k
], sizeof(lenArray
[k
]), 1, source_fp
);
183 #ifndef DICT_JA_ZH_IN_DATAFILE
184 fputs("\n};\n", source_fp
);
188 /* FIXME?: what happens if in every range i there is at least one charArray != 0
189 => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
190 => then in index2, the last range will be ignored incorrectly */
// printIndex1: for each of the 0x100 ranges i (the code units (i<<8) .. (i<<8)+0xff)
// stores one value in set[i] and writes it to source_fp: the next value of
// `count` when j < 0x100 after the scan of charArray, otherwise 0xff.
// The output is either "0x%02x, " text inside an `index1[]` initializer, or raw
// sal_Int16 values via fwrite(). `set` must have room for 0x100 entries; it is
// filled here and read again by printIndex2().
//
// NOTE(review): this listing appears to have lost lines (the embedded numbering
// jumps, e.g. 196 -> 200 -> 202 -> 205). The function braces, the declarations
// of `j` and `count`, the body of the `while` loop, and the #else/#endif
// directives are not visible here -- confirm against the complete file.
191 static inline void printIndex1(FILE *source_fp
, sal_Int16
*set
)
193 #ifndef DICT_JA_ZH_IN_DATAFILE
194 fprintf (source_fp
, "static const sal_Int16 index1[] = {\n\t");
// index1Offset is declared only under DICT_JA_ZH_IN_DATAFILE, so this
// statement presumably sits in an #else branch that is not visible here.
196 index1Offset
= ftell(source_fp
);
200 for (sal_Int32 i
= 0; i
< 0x100; i
++) {
// Scan range i of charArray while entries are zero.
202 while( j
< 0x100 && charArray
[(i
<<8) + j
] == 0)
// 0xff marks a range for which the scan reached j == 0x100.
205 set
[i
] = (j
< 0x100 ? count
++ : 0xff);
206 #ifndef DICT_JA_ZH_IN_DATAFILE
207 fprintf(source_fp
, "0x%02x, ", set
[i
]);
// Start a new output line after every 16 values.
208 if ((i
& 0x0f) == 0x0f)
209 fputs ("\n\t", source_fp
);
211 fwrite(&set
[i
], sizeof(set
[i
]), 1, source_fp
);
215 #ifndef DICT_JA_ZH_IN_DATAFILE
216 fputs("};\n", source_fp
);
// printIndex2: for every range i whose set[i] is not 0xff, writes 0x100 values
// to source_fp, one per code unit (i<<8)+j. Each value is charArray[k] + 1 when
// k < 0x10000 and 0 otherwise, where k starts at (i<<8)+j.
// The output is either "0x%lx, " text inside an `index2[]` initializer, or raw
// sal_uInt32 values via fwrite(). `set` is the table filled by printIndex1().
//
// NOTE(review): this listing appears to have lost lines (the embedded numbering
// jumps, e.g. 225 -> 228 and 231 -> 233 -> 236). The function braces, the
// declaration of `prev`, any condition in front of the `while` loop, the body
// of that loop, and the #else/#endif directives are not visible here -- confirm
// against the complete file.
220 static inline void printIndex2(FILE *source_fp
, sal_Int16
const *set
)
222 #ifndef DICT_JA_ZH_IN_DATAFILE
223 fputs ("static const sal_Int32 index2[] = {\n\t", source_fp
);
// index2Offset is declared only under DICT_JA_ZH_IN_DATAFILE, so this
// statement presumably sits in an #else branch that is not visible here.
225 index2Offset
= ftell(source_fp
);
228 for (sal_Int32 i
= 0; i
< 0x100; i
++) {
// Ranges marked 0xff by printIndex1() produce no output.
229 if (set
[i
] != 0xff) {
230 for (sal_Int32 j
= 0; j
< 0x100; j
++) {
231 sal_Int32 k
= (i
<<8) + j
;
// Look at entries from k onwards while they are zero.
233 while( k
< 0x10000 && charArray
[k
] == 0 )
// prev always takes the entry for the current code unit itself, not for k.
236 prev
= charArray
[(i
<<8) + j
];
237 #ifndef DICT_JA_ZH_IN_DATAFILE
// NOTE(review): the "+ 1" presumably accounts for the extra leading element
// that printLenArray() writes -- confirm.
238 fprintf(source_fp
, "0x%lx, ", static_cast<long unsigned int>(k
< 0x10000 ? charArray
[k
] + 1 : 0));
// Start a new output line after every 16 values.
239 if ((j
& 0x0f) == 0x0f)
240 fputs ("\n\t", source_fp
);
// Raw output of the same value as the text form above.
242 sal_uInt32 n
= (k
< 0x10000 ? charArray
[k
] + 1 : 0);
243 fwrite(&n
, sizeof(n
), 1, source_fp
);
246 #ifndef DICT_JA_ZH_IN_DATAFILE
247 fputs ("\n\t", source_fp
);
251 #ifndef DICT_JA_ZH_IN_DATAFILE
252 fputs ("\n};\n", source_fp
);
256 /* Generates a bitmask for the existence of sal_Unicode values in dictionary;
257 it packs 8 sal_Bool values in 1 sal_uInt8 */
258 static inline void printExistsMask(FILE *source_fp
)
260 #ifndef DICT_JA_ZH_IN_DATAFILE
261 fprintf (source_fp
, "static const sal_uInt8 existMark[] = {\n\t");
263 existMarkOffset
= ftell(source_fp
);
265 for (unsigned int i
= 0; i
< 0x2000; i
++)
267 #ifndef DICT_JA_ZH_IN_DATAFILE
268 fprintf(source_fp
, "0x%02x, ", exists
[i
]);
269 if ( (i
& 0xf) == 0xf )
270 fputs("\n\t", source_fp
);
272 fwrite(&exists
[i
], sizeof(exists
[i
]), 1, source_fp
);
276 #ifndef DICT_JA_ZH_IN_DATAFILE
277 fputs("\n};\n", source_fp
);
// Program entry point. Opens argv[1] (the dictionary) for reading and argv[2]
// (the output) for writing, then emits, in this order: the preamble, the data
// area, lenArray, index1, index2, the exists mask and the accessor functions
// (named with argv[3]). The five table offsets are also written at the end of
// the output file by the fwrite() calls below.
//
// NOTE(review): this listing appears to have lost lines (the embedded numbering
// jumps, e.g. 287 -> 291, 294 -> 302 and 335 -> 341). The function braces, the
// statements that follow each error message, the #else/#endif directives and
// the end of the function are not visible here -- confirm against the complete
// file.
281 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc
, argv
)
283 FILE *dictionary_fp
, *source_fp
;
// NOTE(review): the message asks for three arguments, but this condition also
// lets argc == 2 and argc == 3 through, although argv[2] and argv[3] are used
// below -- verify whether the check should be argc != 4.
285 if (argc
== 1 || argc
> 4)
287 fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr
);
291 dictionary_fp
= fopen(argv
[1], "rb"); // open the dictionary (input) file for reading
292 if (dictionary_fp
== nullptr)
294 fprintf(stderr
, "Opening the dictionary source file %s for reading failed: %s\n", argv
[1], strerror(errno
));
302 // create the C source file to write
303 source_fp
= fopen(argv
[2], "wb");
304 if (source_fp
== nullptr) {
// NOTE(review): fclose() runs before strerror(errno) is evaluated and may
// change errno, so the message can report the wrong error -- consider saving
// errno right after the failed fopen().
305 fclose(dictionary_fp
);
306 fprintf(stderr
, "Opening %s for writing failed: %s\n", argv
[2], strerror(errno
));
311 vector
<sal_uInt32
> lenArray
; // stores the word boundaries in DataArea
// Filled by printIndex1() and read by printIndex2().
312 sal_Int16 set
[0x100];
314 printIncludes(source_fp
);
315 #ifndef DICT_JA_ZH_IN_DATAFILE
316 fputs("extern \"C\" {\n", source_fp
);
// printDataArea() must run first: it fills lenArray, charArray and the exists
// bit set that the following calls write out.
318 printDataArea(dictionary_fp
, source_fp
, lenArray
);
319 printLenArray(source_fp
, lenArray
);
320 printIndex1(source_fp
, set
);
321 printIndex2(source_fp
, set
);
322 printExistsMask(source_fp
);
323 printFunctions(source_fp
, argv
[3]);
324 #ifndef DICT_JA_ZH_IN_DATAFILE
325 fputs("}\n", source_fp
);
327 // Put pointers to the tables at the end of the file...
// The offsets are declared only under DICT_JA_ZH_IN_DATAFILE, so these writes
// presumably sit in an #else branch that is not visible here.
328 fwrite(&dataAreaOffset
, sizeof(dataAreaOffset
), 1, source_fp
);
329 fwrite(&lenArrayOffset
, sizeof(lenArrayOffset
), 1, source_fp
);
330 fwrite(&index1Offset
, sizeof(index1Offset
), 1, source_fp
);
331 fwrite(&index2Offset
, sizeof(index2Offset
), 1, source_fp
);
332 fwrite(&existMarkOffset
, sizeof(existMarkOffset
), 1, source_fp
);
335 fclose(dictionary_fp
);
341 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */