Version 4.0.0.1, tag libreoffice-4.0.0.1
[LibreOffice.git] / i18npool / source / breakiterator / gendict.cxx
blob6c9b65af2debefb516a4c10254c745ded2cc0e50
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <stdio.h>
22 #include <string.h>
23 #include <stdlib.h>
24 #include <errno.h>
25 #include <sal/main.h>
26 #include <sal/types.h>
27 #include <rtl/strbuf.hxx>
28 #include <rtl/ustring.hxx>
29 #include <osl/diagnose.h>
30 #include <vector>
31 using std::vector;
33 using namespace ::rtl;
35 /* Utility gendict:
37 "BreakIterator_CJK provides input string caching and dictionary searching for
38 longest matching. You can provide a sorted dictionary (the encoding must be
39 UTF-8) by creating the following file:
40 i18npool/source/breakiterator/data/<language>.dict.
42 The utility gendict will convert the file to C code, which will be compiled
43 into a shared library for dynamic loading.
45 All dictionary searching and loading is performed in the xdictionary class.
46 The only thing you need to do is to derive your class from BreakIterator_CJK
47 and create an instance of the xdictionary with the language name and
48 pass it to the parent class." (from http://wiki.services.openoffice.org/wiki/
49 /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011)
52 // C-standard garantees that static variables are automatically initialized to 0
53 static sal_uInt8 exists[0x2000];
54 static sal_uInt32 charArray[0x10000];
56 static inline void set_exists(sal_uInt32 index)
58 exists[index>>3] |= 1 << (index & 0x07);
/* Writes the opening lines of the generated source to source_fp: the
   "generated file" banner comment followed by the sal/types.h include. */
static inline void printIncludes(FILE* source_fp)
{
    static const char* const preamble[] = {
        "/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n",
        "#include <sal/types.h>\n\n"
    };
    const size_t lineCount = sizeof(preamble) / sizeof(preamble[0]);

    for (size_t i = 0; i < lineCount; ++i)
        fputs(preamble[i], source_fp);
}
/* Writes the exported accessor functions of the generated source to source_fp.
 *
 * Two variants are emitted, selected in the generated file by the
 * DISABLE_DYNLOADING preprocessor symbol:
 *   - not defined: getExistMark(), getIndex1(), getIndex2(), getLenArray(),
 *     getDataArea();
 *   - defined:     the same five functions with "_<lang>" appended to the name.
 *
 * source_fp  stream that receives the generated code.
 * lang       suffix for the second variant. A null pointer is treated as an
 *            empty suffix: main() can hand one over when the tool is run
 *            without a language code, and giving fprintf() a null pointer for
 *            "%s" is undefined behaviour.
 */
static inline void printFunctions(FILE* source_fp, const char *lang)
{
    const char *suffix = (lang != NULL) ? lang : "";

    fputs ("#ifndef DISABLE_DYNLOADING\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n", source_fp);
    fputs ("#else\n", source_fp);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n", suffix);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n", suffix);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n", suffix);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n", suffix);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n", suffix);
    fputs ("#endif\n", source_fp);
}
84 static inline void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray)
86 // generate main dict. data array
87 fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp);
88 sal_Char str[1024];
89 sal_uInt32 lenArrayCurr = 0;
90 sal_Unicode current = 0;
92 while (fgets(str, 1024, dictionary_fp)) {
93 // input file is in UTF-8 encoding
94 // don't convert last new line character to Ostr.
95 OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
96 const sal_Unicode *u = Ostr.getStr();
98 const sal_Int32 len = Ostr.getLength();
100 sal_Int32 i=0;
101 Ostr.iterateCodePoints(&i, 1);
102 if (len == i)
103 continue; // skip one character word
105 if (u[0] != current) {
106 OSL_ENSURE( (u[0] > current), "Dictionary file should be sorted");
107 current = u[0];
108 charArray[current] = lenArray.size();
111 lenArray.push_back(lenArrayCurr);
113 set_exists(u[0]);
114 // first character is stored in charArray, so start from second
115 for (i = 1; i < len; i++, lenArrayCurr++) {
116 set_exists(u[i]);
117 fprintf(source_fp, "0x%04x, ", u[i]);
118 if ((lenArrayCurr & 0x0f) == 0x0f)
119 fputs("\n\t", source_fp);
122 lenArray.push_back( lenArrayCurr ); // store last ending pointer
123 charArray[current+1] = lenArray.size();
124 fputs("\n};\n", source_fp);
127 static inline void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray)
129 fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
130 fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
131 for (size_t k = 0; k < lenArray.size(); k++)
133 if( !(k & 0xf) )
134 fputs("\n\t", source_fp);
136 fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k]));
138 fputs("\n};\n", source_fp );
141 /* FIXME?: what happens if in every range i there is at least one charArray != 0
142 => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
143 => then in index2, the last range will be ignored incorrectly */
144 static inline void printIndex1(FILE *source_fp, sal_Int16 *set)
146 fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
147 sal_Int16 count = 0;
148 for (sal_Int32 i = 0; i < 0x100; i++) {
149 sal_Int32 j = 0;
150 while( j < 0x100 && charArray[(i<<8) + j] == 0)
151 j++;
153 fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ? count++ : 0xff));
154 if ((i & 0x0f) == 0x0f)
155 fputs ("\n\t", source_fp);
157 fputs("};\n", source_fp);
160 static inline void printIndex2(FILE *source_fp, sal_Int16 *set)
162 fputs ("static const sal_Int32 index2[] = {\n\t", source_fp);
163 sal_Int32 prev = 0;
164 for (sal_Int32 i = 0; i < 0x100; i++) {
165 if (set[i] != 0xff) {
166 for (sal_Int32 j = 0; j < 0x100; j++) {
167 sal_Int32 k = (i<<8) + j;
168 if (prev != 0 )
169 while( k < 0x10000 && charArray[k] == 0 )
170 k++;
172 prev = charArray[(i<<8) + j];
173 fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0));
174 if ((j & 0x0f) == 0x0f)
175 fputs ("\n\t", source_fp);
177 fputs ("\n\t", source_fp);
180 fputs ("\n};\n", source_fp);
183 /* Generates a bitmask for the existance of sal_Unicode values in dictionary;
184 it packs 8 sal_Bool values in 1 sal_uInt8 */
185 static inline void printExistsMask(FILE *source_fp)
187 fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
188 for (unsigned int i = 0; i < 0x2000; i++)
190 fprintf(source_fp, "0x%02x, ", exists[i]);
191 if ( (i & 0xf) == 0xf )
192 fputs("\n\t", source_fp);
194 fputs("\n};\n", source_fp);
197 SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
199 FILE *dictionary_fp, *source_fp;
201 if (argc == 1 || argc > 4)
203 fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr);
204 exit(-1);
207 dictionary_fp = fopen(argv[1], "rb"); // open the source file for read;
208 if (dictionary_fp == NULL)
210 fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
211 exit(1);
214 if(argc == 2)
215 source_fp = stdout;
216 else
218 // create the C source file to write
219 source_fp = fopen(argv[2], "wb");
220 if (source_fp == NULL) {
221 fclose(dictionary_fp);
222 fprintf(stderr, "Opening %s for writing failed: %s\n", argv[2], strerror(errno));
223 exit(1);
227 vector<sal_uInt32> lenArray; // stores the word boundaries in DataArea
228 sal_Int16 set[0x100];
230 printIncludes(source_fp);
231 fputs("extern \"C\" {\n", source_fp);
232 printDataArea(dictionary_fp, source_fp, lenArray);
233 printLenArray(source_fp, lenArray);
234 printIndex1(source_fp, set);
235 printIndex2(source_fp, set);
236 printExistsMask(source_fp);
237 printFunctions(source_fp, argv[3]);
238 fputs("}\n", source_fp);
240 fclose(dictionary_fp);
241 fclose(source_fp);
243 return 0;
246 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */