1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <cclass_unicode.hxx>
22 #include <unicode/uchar.h>
23 #include <rtl/math.hxx>
24 #include <rtl/ustring.hxx>
25 #include <com/sun/star/i18n/KParseTokens.hpp>
26 #include <com/sun/star/i18n/KParseType.hpp>
27 #include <com/sun/star/i18n/UnicodeType.hpp>
28 #include <com/sun/star/i18n/LocaleData.hpp>
29 #include <com/sun/star/i18n/NativeNumberMode.hpp>
30 #include <com/sun/star/i18n/NativeNumberSupplier.hpp>
31 #include <comphelper/processfactory.hxx>
33 #include <string.h> // memcpy()
35 using namespace ::com::sun::star::uno
;
36 using namespace ::com::sun::star::lang
;
37 using namespace ::rtl
;
39 namespace com
{ namespace sun
{ namespace star
{ namespace i18n
{
// Flag bits used in the parser tables (type UPT_FLAG_TYPE). Every non-zero
// constant below is a distinct single bit so the flags can be OR-ed together;
// TOKEN_ILLEGAL is the empty set.
41 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_ILLEGAL
= 0x00000000;
42 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR
= 0x00000001;
43 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR_BOOL
= 0x00000002;
44 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR_WORD
= 0x00000004;
45 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR_VALUE
= 0x00000008;
46 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR_STRING
= 0x00000010;
47 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR_DONTCARE
= 0x00000020;
48 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_BOOL
= 0x00000040;
49 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_WORD
= 0x00000080;
50 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_WORD_SEP
= 0x00000100;
51 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE
= 0x00000200;
52 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE_SEP
= 0x00000400;
53 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE_EXP
= 0x00000800;
54 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE_SIGN
= 0x00001000;
55 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE_EXP_VALUE
= 0x00002000;
56 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE_DIGIT
= 0x00004000;
57 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_NAME_SEP
= 0x20000000;
58 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_STRING_SEP
= 0x40000000;
59 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_EXCLUDED
= 0x80000000;
// Flag combination shared by every decimal digit entry ('0'..'9') of the
// default table and by the number categories handled in getFlagsExtended().
61 #define TOKEN_DIGIT_FLAGS (TOKEN_CHAR_VALUE | TOKEN_VALUE | TOKEN_VALUE_EXP | TOKEN_VALUE_EXP_VALUE | TOKEN_VALUE_DIGIT)
// Default per-character flags for the character codes below nDefCnt (128).
// initParserTable() copies this table into pTable and then adjusts the copy
// for the locale separators and the requested start/continuation types.
// The trailing "(...)" comments on individual entries record the Calc formula
// compiler specific values that were replaced.
63 // Default identifier/name specification is [A-Za-z_][A-Za-z0-9_]*
65 const sal_uInt8
cclass_Unicode::nDefCnt
= 128;
66 const UPT_FLAG_TYPE
cclass_Unicode::pDefaultParserTable
[ nDefCnt
] =
68 // (...) == Calc formula compiler specific, commented out and modified
70 /* \0 */ TOKEN_EXCLUDED
,
79 /* 9 \t */ TOKEN_CHAR_DONTCARE
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL)
81 /* 11 \v */ TOKEN_CHAR_DONTCARE
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL)
102 /* 32 */ TOKEN_CHAR_DONTCARE
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
103 /* 33 ! */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
104 /* 34 " */ TOKEN_CHAR_STRING
| TOKEN_STRING_SEP
,
105 /* 35 # */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_WORD_SEP)
106 /* 36 $ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_CHAR_WORD | TOKEN_WORD)
107 /* 37 % */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_VALUE)
108 /* 38 & */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
109 /* 39 ' */ TOKEN_NAME_SEP
,
110 /* 40 ( */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
111 /* 41 ) */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
112 /* 42 * */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
113 /* 43 + */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
| TOKEN_VALUE_EXP
| TOKEN_VALUE_SIGN
,
114 /* 44 , */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_CHAR_VALUE | TOKEN_VALUE)
115 /* 45 - */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
| TOKEN_VALUE_EXP
| TOKEN_VALUE_SIGN
,
116 /* 46 . */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_WORD | TOKEN_CHAR_VALUE | TOKEN_VALUE)
117 /* 47 / */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
118 //for ( i = 48; i < 58; i++ )
119 /* 48 0 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
120 /* 49 1 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
121 /* 50 2 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
122 /* 51 3 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
123 /* 52 4 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
124 /* 53 5 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
125 /* 54 6 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
126 /* 55 7 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
127 /* 56 8 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
128 /* 57 9 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
129 /* 58 : */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_WORD)
130 /* 59 ; */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
131 /* 60 < */ TOKEN_CHAR_BOOL
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
132 /* 61 = */ TOKEN_CHAR
| TOKEN_BOOL
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
133 /* 62 > */ TOKEN_CHAR_BOOL
| TOKEN_BOOL
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
134 /* 63 ? */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_CHAR_WORD | TOKEN_WORD)
135 /* 64 @ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
136 //for ( i = 65; i < 91; i++ )
137 /* 65 A */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
138 /* 66 B */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
139 /* 67 C */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
140 /* 68 D */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
141 /* 69 E */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
142 /* 70 F */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
143 /* 71 G */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
144 /* 72 H */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
145 /* 73 I */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
146 /* 74 J */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
147 /* 75 K */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
148 /* 76 L */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
149 /* 77 M */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
150 /* 78 N */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
151 /* 79 O */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
152 /* 80 P */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
153 /* 81 Q */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
154 /* 82 R */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
155 /* 83 S */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
156 /* 84 T */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
157 /* 85 U */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
158 /* 86 V */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
159 /* 87 W */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
160 /* 88 X */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
161 /* 89 Y */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
162 /* 90 Z */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
163 /* 91 [ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
164 /* 92 \ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
165 /* 93 ] */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
166 /* 94 ^ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
167 /* 95 _ */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
168 /* 96 ` */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
169 //for ( i = 97; i < 123; i++ )
170 /* 97 a */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
171 /* 98 b */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
172 /* 99 c */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
173 /* 100 d */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
174 /* 101 e */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
175 /* 102 f */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
176 /* 103 g */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
177 /* 104 h */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
178 /* 105 i */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
179 /* 106 j */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
180 /* 107 k */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
181 /* 108 l */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
182 /* 109 m */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
183 /* 110 n */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
184 /* 111 o */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
185 /* 112 p */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
186 /* 113 q */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
187 /* 114 r */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
188 /* 115 s */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
189 /* 116 t */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
190 /* 117 u */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
191 /* 118 v */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
192 /* 119 w */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
193 /* 120 x */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
194 /* 121 y */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
195 /* 122 z */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
196 /* 123 { */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
197 /* 124 | */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
198 /* 125 } */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
199 /* 126 ~ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
200 /* 127 */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
// (TOKEN_ILLEGAL // UNUSED)
// KParseTokens::ASC_* classification for each character code below nDefCnt.
// getParseTokensType() returns entries of this table directly, indexed by
// the character code.
204 const sal_Int32
cclass_Unicode::pParseTokensType
[ nDefCnt
] =
206 /* \0 */ KParseTokens::ASC_OTHER
,
207 KParseTokens::ASC_CONTROL
,
208 KParseTokens::ASC_CONTROL
,
209 KParseTokens::ASC_CONTROL
,
210 KParseTokens::ASC_CONTROL
,
211 KParseTokens::ASC_CONTROL
,
212 KParseTokens::ASC_CONTROL
,
213 KParseTokens::ASC_CONTROL
,
214 KParseTokens::ASC_CONTROL
,
215 /* 9 \t */ KParseTokens::ASC_CONTROL
,
216 KParseTokens::ASC_CONTROL
,
217 /* 11 \v */ KParseTokens::ASC_CONTROL
,
218 KParseTokens::ASC_CONTROL
,
219 KParseTokens::ASC_CONTROL
,
220 KParseTokens::ASC_CONTROL
,
221 KParseTokens::ASC_CONTROL
,
222 KParseTokens::ASC_CONTROL
,
223 KParseTokens::ASC_CONTROL
,
224 KParseTokens::ASC_CONTROL
,
225 KParseTokens::ASC_CONTROL
,
226 KParseTokens::ASC_CONTROL
,
227 KParseTokens::ASC_CONTROL
,
228 KParseTokens::ASC_CONTROL
,
229 KParseTokens::ASC_CONTROL
,
230 KParseTokens::ASC_CONTROL
,
231 KParseTokens::ASC_CONTROL
,
232 KParseTokens::ASC_CONTROL
,
233 KParseTokens::ASC_CONTROL
,
234 KParseTokens::ASC_CONTROL
,
235 KParseTokens::ASC_CONTROL
,
236 KParseTokens::ASC_CONTROL
,
237 KParseTokens::ASC_CONTROL
,
238 /* 32 */ KParseTokens::ASC_OTHER
,
239 /* 33 ! */ KParseTokens::ASC_OTHER
,
240 /* 34 " */ KParseTokens::ASC_OTHER
,
241 /* 35 # */ KParseTokens::ASC_OTHER
,
242 /* 36 $ */ KParseTokens::ASC_DOLLAR
,
243 /* 37 % */ KParseTokens::ASC_OTHER
,
244 /* 38 & */ KParseTokens::ASC_OTHER
,
245 /* 39 ' */ KParseTokens::ASC_OTHER
,
246 /* 40 ( */ KParseTokens::ASC_OTHER
,
247 /* 41 ) */ KParseTokens::ASC_OTHER
,
248 /* 42 * */ KParseTokens::ASC_OTHER
,
249 /* 43 + */ KParseTokens::ASC_OTHER
,
250 /* 44 , */ KParseTokens::ASC_OTHER
,
251 /* 45 - */ KParseTokens::ASC_OTHER
,
252 /* 46 . */ KParseTokens::ASC_DOT
,
253 /* 47 / */ KParseTokens::ASC_OTHER
,
254 //for ( i = 48; i < 58; i++ )
255 /* 48 0 */ KParseTokens::ASC_DIGIT
,
256 /* 49 1 */ KParseTokens::ASC_DIGIT
,
257 /* 50 2 */ KParseTokens::ASC_DIGIT
,
258 /* 51 3 */ KParseTokens::ASC_DIGIT
,
259 /* 52 4 */ KParseTokens::ASC_DIGIT
,
260 /* 53 5 */ KParseTokens::ASC_DIGIT
,
261 /* 54 6 */ KParseTokens::ASC_DIGIT
,
262 /* 55 7 */ KParseTokens::ASC_DIGIT
,
263 /* 56 8 */ KParseTokens::ASC_DIGIT
,
264 /* 57 9 */ KParseTokens::ASC_DIGIT
,
265 /* 58 : */ KParseTokens::ASC_COLON
,
266 /* 59 ; */ KParseTokens::ASC_OTHER
,
267 /* 60 < */ KParseTokens::ASC_OTHER
,
268 /* 61 = */ KParseTokens::ASC_OTHER
,
269 /* 62 > */ KParseTokens::ASC_OTHER
,
270 /* 63 ? */ KParseTokens::ASC_OTHER
,
271 /* 64 @ */ KParseTokens::ASC_OTHER
,
272 //for ( i = 65; i < 91; i++ )
273 /* 65 A */ KParseTokens::ASC_UPALPHA
,
274 /* 66 B */ KParseTokens::ASC_UPALPHA
,
275 /* 67 C */ KParseTokens::ASC_UPALPHA
,
276 /* 68 D */ KParseTokens::ASC_UPALPHA
,
277 /* 69 E */ KParseTokens::ASC_UPALPHA
,
278 /* 70 F */ KParseTokens::ASC_UPALPHA
,
279 /* 71 G */ KParseTokens::ASC_UPALPHA
,
280 /* 72 H */ KParseTokens::ASC_UPALPHA
,
281 /* 73 I */ KParseTokens::ASC_UPALPHA
,
282 /* 74 J */ KParseTokens::ASC_UPALPHA
,
283 /* 75 K */ KParseTokens::ASC_UPALPHA
,
284 /* 76 L */ KParseTokens::ASC_UPALPHA
,
285 /* 77 M */ KParseTokens::ASC_UPALPHA
,
286 /* 78 N */ KParseTokens::ASC_UPALPHA
,
287 /* 79 O */ KParseTokens::ASC_UPALPHA
,
288 /* 80 P */ KParseTokens::ASC_UPALPHA
,
289 /* 81 Q */ KParseTokens::ASC_UPALPHA
,
290 /* 82 R */ KParseTokens::ASC_UPALPHA
,
291 /* 83 S */ KParseTokens::ASC_UPALPHA
,
292 /* 84 T */ KParseTokens::ASC_UPALPHA
,
293 /* 85 U */ KParseTokens::ASC_UPALPHA
,
294 /* 86 V */ KParseTokens::ASC_UPALPHA
,
295 /* 87 W */ KParseTokens::ASC_UPALPHA
,
296 /* 88 X */ KParseTokens::ASC_UPALPHA
,
297 /* 89 Y */ KParseTokens::ASC_UPALPHA
,
298 /* 90 Z */ KParseTokens::ASC_UPALPHA
,
299 /* 91 [ */ KParseTokens::ASC_OTHER
,
300 /* 92 \ */ KParseTokens::ASC_OTHER
,
301 /* 93 ] */ KParseTokens::ASC_OTHER
,
302 /* 94 ^ */ KParseTokens::ASC_OTHER
,
303 /* 95 _ */ KParseTokens::ASC_UNDERSCORE
,
304 /* 96 ` */ KParseTokens::ASC_OTHER
,
305 //for ( i = 97; i < 123; i++ )
306 /* 97 a */ KParseTokens::ASC_LOALPHA
,
307 /* 98 b */ KParseTokens::ASC_LOALPHA
,
308 /* 99 c */ KParseTokens::ASC_LOALPHA
,
309 /* 100 d */ KParseTokens::ASC_LOALPHA
,
310 /* 101 e */ KParseTokens::ASC_LOALPHA
,
311 /* 102 f */ KParseTokens::ASC_LOALPHA
,
312 /* 103 g */ KParseTokens::ASC_LOALPHA
,
313 /* 104 h */ KParseTokens::ASC_LOALPHA
,
314 /* 105 i */ KParseTokens::ASC_LOALPHA
,
315 /* 106 j */ KParseTokens::ASC_LOALPHA
,
316 /* 107 k */ KParseTokens::ASC_LOALPHA
,
317 /* 108 l */ KParseTokens::ASC_LOALPHA
,
318 /* 109 m */ KParseTokens::ASC_LOALPHA
,
319 /* 110 n */ KParseTokens::ASC_LOALPHA
,
320 /* 111 o */ KParseTokens::ASC_LOALPHA
,
321 /* 112 p */ KParseTokens::ASC_LOALPHA
,
322 /* 113 q */ KParseTokens::ASC_LOALPHA
,
323 /* 114 r */ KParseTokens::ASC_LOALPHA
,
324 /* 115 s */ KParseTokens::ASC_LOALPHA
,
325 /* 116 t */ KParseTokens::ASC_LOALPHA
,
326 /* 117 u */ KParseTokens::ASC_LOALPHA
,
327 /* 118 v */ KParseTokens::ASC_LOALPHA
,
328 /* 119 w */ KParseTokens::ASC_LOALPHA
,
329 /* 120 x */ KParseTokens::ASC_LOALPHA
,
330 /* 121 y */ KParseTokens::ASC_LOALPHA
,
331 /* 122 z */ KParseTokens::ASC_LOALPHA
,
332 /* 123 { */ KParseTokens::ASC_OTHER
,
333 /* 124 | */ KParseTokens::ASC_OTHER
,
334 /* 125 } */ KParseTokens::ASC_OTHER
,
335 /* 126 ~ */ KParseTokens::ASC_OTHER
,
336 /* 127 */ KParseTokens::ASC_OTHER
341 const sal_Unicode
* cclass_Unicode::StrChr( const sal_Unicode
* pStr
, sal_Unicode c
)
// Returns the KParseTokens classification of the character aStr[nPos].
// One path returns the pParseTokensType entry indexed by the character; the
// other maps the ICU general category from u_charType() to the matching
// KParseTokens::UNI_* value and ends with UNI_OTHER for everything that did
// not return from the switch.
// NOTE(review): the condition choosing between the two paths is not visible
// here -- presumably c < nDefCnt; confirm.
355 sal_Int32
cclass_Unicode::getParseTokensType( const sal_Unicode
* aStr
, sal_Int32 nPos
)
357 sal_Unicode c
= aStr
[nPos
];
359 return pParseTokensType
[ sal_uInt8(c
) ];
363 //! all KParseTokens::UNI_... must be matched
364 switch ( u_charType( (sal_uInt32
) c
) )
366 case U_UPPERCASE_LETTER
:
367 return KParseTokens::UNI_UPALPHA
;
368 case U_LOWERCASE_LETTER
:
369 return KParseTokens::UNI_LOALPHA
;
370 case U_TITLECASE_LETTER
:
371 return KParseTokens::UNI_TITLE_ALPHA
;
372 case U_MODIFIER_LETTER
:
373 return KParseTokens::UNI_MODIFIER_LETTER
;
// NOTE(review): the comments in the next case talk about Non_Spacing_Mark,
// but the nPos == 0 check sits under U_OTHER_LETTER, while
// U_NON_SPACING_MARK always yields UNI_OTHER_LETTER. getFlagsExtended()
// orders these two cases the other way round. Confirm which is intended.
374 case U_OTHER_LETTER
:
375 // Non_Spacing_Mark could not be as leading character
376 if (nPos
== 0) break;
377 // fall through, treat it as Other_Letter.
378 case U_NON_SPACING_MARK
:
379 return KParseTokens::UNI_OTHER_LETTER
;
380 case U_DECIMAL_DIGIT_NUMBER
:
381 return KParseTokens::UNI_DIGIT
;
382 case U_LETTER_NUMBER
:
383 return KParseTokens::UNI_LETTER_NUMBER
;
384 case U_OTHER_NUMBER
:
385 return KParseTokens::UNI_OTHER_NUMBER
;
// Reached when the switch was left without returning.
388 return KParseTokens::UNI_OTHER
;
// Stores rLocale as the parser locale (aParserLocale) and creates the
// LocaleData service reference when it is not set yet.
// bChanged records whether Language, Country or Variant differ from the
// previously stored locale.
// NOTE(review): the return statement is not visible here -- presumably
// bChanged is returned; confirm.
392 sal_Bool
cclass_Unicode::setupInternational( const Locale
& rLocale
)
394 sal_Bool bChanged
= (aParserLocale
.Language
!= rLocale
.Language
395 || aParserLocale
.Country
!= rLocale
.Country
396 || aParserLocale
.Variant
!= rLocale
.Variant
);
// Take over the three locale fields individually.
399 aParserLocale
.Language
= rLocale
.Language
;
400 aParserLocale
.Country
= rLocale
.Country
;
401 aParserLocale
.Variant
= rLocale
.Variant
;
// Create the locale data service on first use.
403 if ( !mxLocaleData
.is() )
405 mxLocaleData
.set( LocaleData::create(m_xContext
) );
// Makes sure the parser table matches the requested configuration.
// initParserTable() is called only when there is no table yet (pTable is
// null) or when the locale, one of the two token type masks, or one of the
// two user-defined character strings differs from the values currently
// stored in the object.
411 void cclass_Unicode::setupParserTable( const Locale
& rLocale
, sal_Int32 startCharTokenType
,
412 const OUString
& userDefinedCharactersStart
, sal_Int32 contCharTokenType
,
413 const OUString
& userDefinedCharactersCont
)
// Locale equality is checked field by field.
415 bool bIntlEqual
= (rLocale
.Language
== aParserLocale
.Language
&&
416 rLocale
.Country
== aParserLocale
.Country
&&
417 rLocale
.Variant
== aParserLocale
.Variant
);
418 if ( !pTable
|| !bIntlEqual
||
419 startCharTokenType
!= nStartTypes
||
420 contCharTokenType
!= nContTypes
||
421 userDefinedCharactersStart
!= aStartChars
||
422 userDefinedCharactersCont
!= aContChars
)
423 initParserTable( rLocale
, startCharTokenType
, userDefinedCharactersStart
,
424 contCharTokenType
, userDefinedCharactersCont
);
// (Re)initializes the parser tables for the given configuration.
//
// Steps visible in this function:
//  - setupInternational() stores the locale,
//  - pTable is filled with a copy of pDefaultParserTable,
//  - the requested type masks and user-defined character strings are stored
//    in nStartTypes / nContTypes / aStartChars / aContChars,
//  - the locale's group and decimal separators are read and flagged as
//    value characters in pTable,
//  - pTable entries are adjusted according to the KParseTokens::ASC_* bits
//    in the start and continuation masks,
//  - pStart / pCont get one entry per user-defined character, and the
//    corresponding pTable entries are flagged as well.
//
// rLocale                     locale whose separators are used
// startCharTokenType          KParseTokens mask for a token's first character
// userDefinedCharactersStart  extra characters allowed as first character
// contCharTokenType           KParseTokens mask for following characters
// userDefinedCharactersCont   extra characters allowed as following character
428 void cclass_Unicode::initParserTable( const Locale
& rLocale
, sal_Int32 startCharTokenType
,
429 const OUString
& userDefinedCharactersStart
, sal_Int32 contCharTokenType
,
430 const OUString
& userDefinedCharactersCont
)
433 setupInternational( rLocale
);
434 // Memory of pTable is reused.
436 pTable
= new UPT_FLAG_TYPE
[nDefCnt
];
437 memcpy( pTable
, pDefaultParserTable
, sizeof(UPT_FLAG_TYPE
) * nDefCnt
);
438 // Start and cont tables only need reallocation if different length.
439 if ( pStart
&& userDefinedCharactersStart
.getLength() != aStartChars
.getLength() )
444 if ( pCont
&& userDefinedCharactersCont
.getLength() != aContChars
.getLength() )
// Remember the configuration this table is built for; setupParserTable()
// compares against these members.
449 nStartTypes
= startCharTokenType
;
450 nContTypes
= contCharTokenType
;
451 aStartChars
= userDefinedCharactersStart
;
452 aContChars
= userDefinedCharactersCont
;
// Locale dependent separators: only the first character of each is used.
455 if( mxLocaleData
.is() )
457 LocaleDataItem aItem
=
458 mxLocaleData
->getLocaleItem( aParserLocale
);
459 //!TODO: theoretically separators may be a string, adjustment would have to be
460 //! done here and in parsing and in ::rtl::math::stringToDouble()
461 cGroupSep
= aItem
.thousandSeparator
.getStr()[0];
462 cDecimalSep
= aItem
.decimalSeparator
.getStr()[0];
// Separators inside the table range become part of values; the decimal
// separator may additionally start a value.
465 if ( cGroupSep
< nDefCnt
)
466 pTable
[cGroupSep
] |= TOKEN_VALUE
;
467 if ( cDecimalSep
< nDefCnt
)
468 pTable
[cDecimalSep
] |= TOKEN_CHAR_VALUE
| TOKEN_VALUE
;
470 // Modify characters according to KParseTokens definitions.
472 using namespace KParseTokens
;
// 'A'..'Z'
475 if ( !(nStartTypes
& ASC_UPALPHA
) )
476 for ( i
= 65; i
< 91; i
++ )
477 pTable
[i
] &= ~TOKEN_CHAR_WORD
; // not allowed as start character
478 if ( !(nContTypes
& ASC_UPALPHA
) )
479 for ( i
= 65; i
< 91; i
++ )
480 pTable
[i
] &= ~TOKEN_WORD
; // not allowed as cont character
// 'a'..'z'
482 if ( !(nStartTypes
& ASC_LOALPHA
) )
483 for ( i
= 97; i
< 123; i
++ )
484 pTable
[i
] &= ~TOKEN_CHAR_WORD
; // not allowed as start character
485 if ( !(nContTypes
& ASC_LOALPHA
) )
486 for ( i
= 97; i
< 123; i
++ )
487 pTable
[i
] &= ~TOKEN_WORD
; // not allowed as cont character
// '0'..'9': not a start character by default, so the start flag is added
// on request while the cont flag is removed when not requested.
489 if ( nStartTypes
& ASC_DIGIT
)
490 for ( i
= 48; i
< 58; i
++ )
491 pTable
[i
] |= TOKEN_CHAR_WORD
; // allowed as start character
492 if ( !(nContTypes
& ASC_DIGIT
) )
493 for ( i
= 48; i
< 58; i
++ )
494 pTable
[i
] &= ~TOKEN_WORD
; // not allowed as cont character
// '_'
496 if ( !(nStartTypes
& ASC_UNDERSCORE
) )
497 pTable
[95] &= ~TOKEN_CHAR_WORD
; // not allowed as start character
498 if ( !(nContTypes
& ASC_UNDERSCORE
) )
499 pTable
[95] &= ~TOKEN_WORD
; // not allowed as cont character
// '$'
501 if ( nStartTypes
& ASC_DOLLAR
)
502 pTable
[36] |= TOKEN_CHAR_WORD
; // allowed as start character
503 if ( nContTypes
& ASC_DOLLAR
)
504 pTable
[36] |= TOKEN_WORD
; // allowed as cont character
// '.'
506 if ( nStartTypes
& ASC_DOT
)
507 pTable
[46] |= TOKEN_CHAR_WORD
; // allowed as start character
508 if ( nContTypes
& ASC_DOT
)
509 pTable
[46] |= TOKEN_WORD
; // allowed as cont character
// ':'
511 if ( nStartTypes
& ASC_COLON
)
512 pTable
[58] |= TOKEN_CHAR_WORD
; // allowed as start character
513 if ( nContTypes
& ASC_COLON
)
514 pTable
[58] |= TOKEN_WORD
; // allowed as cont character
// Control characters 1..31 (code 0 is left untouched).
516 if ( nStartTypes
& ASC_CONTROL
)
517 for ( i
= 1; i
< 32; i
++ )
518 pTable
[i
] |= TOKEN_CHAR_WORD
; // allowed as start character
519 if ( nContTypes
& ASC_CONTROL
)
520 for ( i
= 1; i
< 32; i
++ )
521 pTable
[i
] |= TOKEN_WORD
; // allowed as cont character
// Everything from space up to the end of the table.
523 if ( nStartTypes
& ASC_ANY_BUT_CONTROL
)
524 for ( i
= 32; i
< nDefCnt
; i
++ )
525 pTable
[i
] |= TOKEN_CHAR_WORD
; // allowed as start character
526 if ( nContTypes
& ASC_ANY_BUT_CONTROL
)
527 for ( i
= 32; i
< nDefCnt
; i
++ )
528 pTable
[i
] |= TOKEN_WORD
; // allowed as cont character
532 // Merge in (positively override with) user defined characters.
534 sal_Int32 nLen
= aStartChars
.getLength();
538 pStart
= new UPT_FLAG_TYPE
[ nLen
];
539 const sal_Unicode
* p
= aStartChars
.getStr();
// j indexes pStart while p walks the characters of aStartChars in step.
540 for ( sal_Int32 j
=0; j
<nLen
; j
++, p
++ )
542 pStart
[j
] = TOKEN_CHAR_WORD
;
544 pTable
[*p
] |= TOKEN_CHAR_WORD
;
548 nLen
= aContChars
.getLength();
552 pCont
= new UPT_FLAG_TYPE
[ nLen
];
553 const sal_Unicode
* p
= aContChars
.getStr();
// p must advance together with j, exactly as in the start-character loop;
// otherwise only the first continuation character would ever be merged
// into pTable.
554 for ( sal_Int32 j
=0; j
<nLen
; j
++, p
++ )
556 pCont
[j
] = TOKEN_WORD
;
558 pTable
[*p
] |= TOKEN_WORD
;
564 void cclass_Unicode::destroyParserTable()
// Returns the flag mask for the character aStr[nPos].
// The base mask comes either from pTable (indexed by the character) or from
// getFlagsExtended(). Depending on the parser state, user-defined characters
// are then merged in: for the start-type states listed below via
// getStartCharsFlags(), otherwise via getContCharsFlags(). When that merge
// yields TOKEN_CHAR_WORD / TOKEN_WORD, TOKEN_EXCLUDED is cleared again.
// NOTE(review): the condition selecting pTable vs. getFlagsExtended(), the
// switch header and the remaining case labels are not visible here.
575 UPT_FLAG_TYPE
cclass_Unicode::getFlags( const sal_Unicode
* aStr
, sal_Int32 nPos
)
578 sal_Unicode c
= aStr
[nPos
];
580 nMask
= pTable
[ sal_uInt8(c
) ];
582 nMask
= getFlagsExtended( aStr
, nPos
);
// States in which the character is a candidate first character of a word.
586 case ssRewindFromValue
:
587 case ssIgnoreLeadingInRewind
:
588 case ssGetWordFirstChar
:
589 if ( !(nMask
& TOKEN_CHAR_WORD
) )
591 nMask
|= getStartCharsFlags( c
);
592 if ( nMask
& TOKEN_CHAR_WORD
)
593 nMask
&= ~TOKEN_EXCLUDED
;
// Same treatment for continuation characters.
598 if ( !(nMask
& TOKEN_WORD
) )
600 nMask
|= getContCharsFlags( c
);
601 if ( nMask
& TOKEN_WORD
)
602 nMask
&= ~TOKEN_EXCLUDED
;
606 ; // other cases aren't needed, no compiler warning
// Computes the flag mask for aStr[nPos] without using pTable.
// The locale separators are checked first; everything else is classified by
// its ICU general category (u_charType()). A category contributes
// TOKEN_CHAR_WORD (start position) or TOKEN_WORD (continuation position)
// when the corresponding KParseTokens::UNI_* bit is set in the active type
// mask; number categories additionally always carry TOKEN_DIGIT_FLAGS.
// Categories not handled by the switch yield TOKEN_ILLEGAL.
// NOTE(review): the result for the group separator and the "else" values of
// several conditional expressions are not visible here.
612 UPT_FLAG_TYPE
cclass_Unicode::getFlagsExtended( const sal_Unicode
* aStr
, sal_Int32 nPos
)
614 sal_Unicode c
= aStr
[nPos
];
615 if ( c
== cGroupSep
)
617 else if ( c
== cDecimalSep
)
618 return TOKEN_CHAR_VALUE
| TOKEN_VALUE
;
619 using namespace i18n
;
// bStart: the parser state expects the first character of a token, so the
// start type mask applies instead of the continuation mask.
620 bool bStart
= (eState
== ssGetChar
|| eState
== ssGetWordFirstChar
||
621 eState
== ssRewindFromValue
|| eState
== ssIgnoreLeadingInRewind
);
622 sal_Int32 nTypes
= (bStart
? nStartTypes
: nContTypes
);
624 //! all KParseTokens::UNI_... must be matched
625 switch ( u_charType( (sal_uInt32
) c
) )
627 case U_UPPERCASE_LETTER
:
628 return (nTypes
& KParseTokens::UNI_UPALPHA
) ?
629 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
631 case U_LOWERCASE_LETTER
:
632 return (nTypes
& KParseTokens::UNI_LOALPHA
) ?
633 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
635 case U_TITLECASE_LETTER
:
636 return (nTypes
& KParseTokens::UNI_TITLE_ALPHA
) ?
637 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
639 case U_MODIFIER_LETTER
:
640 return (nTypes
& KParseTokens::UNI_MODIFIER_LETTER
) ?
641 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
643 case U_NON_SPACING_MARK
:
644 case U_COMBINING_SPACING_MARK
:
645 // Non_Spacing_Mark can't be a leading character,
646 // nor can a spacing combining mark.
648 return TOKEN_ILLEGAL
;
649 // fall through, treat it as Other_Letter.
650 case U_OTHER_LETTER
:
651 return (nTypes
& KParseTokens::UNI_OTHER_LETTER
) ?
652 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
// Number categories: word flag only on request, digit flags always.
654 case U_DECIMAL_DIGIT_NUMBER
:
655 return ((nTypes
& KParseTokens::UNI_DIGIT
) ?
656 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
657 TOKEN_ILLEGAL
) | TOKEN_DIGIT_FLAGS
;
658 case U_LETTER_NUMBER
:
659 return ((nTypes
& KParseTokens::UNI_LETTER_NUMBER
) ?
660 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
661 TOKEN_ILLEGAL
) | TOKEN_DIGIT_FLAGS
;
662 case U_OTHER_NUMBER
:
663 return ((nTypes
& KParseTokens::UNI_OTHER_NUMBER
) ?
664 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
665 TOKEN_ILLEGAL
) | TOKEN_DIGIT_FLAGS
;
// Space separators: "don't care" when leading white space is ignored;
// otherwise a word character at start position and a separator elsewhere.
666 case U_SPACE_SEPARATOR
:
667 return ((nTypes
& KParseTokens::IGNORE_LEADING_WS
) ?
668 TOKEN_CHAR_DONTCARE
: (bStart
? TOKEN_CHAR_WORD
: (TOKEN_CHAR_DONTCARE
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
) ));
671 return TOKEN_ILLEGAL
;
// Returns the flags of c as a user-defined start character.
// c is searched in aStartChars via StrChr(); the offset of the hit within
// the string is used as the index into pStart. The other visible result is
// TOKEN_ILLEGAL.
// NOTE(review): the conditions guarding the lookup are not visible here --
// presumably pStart and the StrChr() result are checked for null; confirm.
675 UPT_FLAG_TYPE
cclass_Unicode::getStartCharsFlags( sal_Unicode c
)
679 const sal_Unicode
* pStr
= aStartChars
.getStr();
680 const sal_Unicode
* p
= StrChr( pStr
, c
);
682 return pStart
[ p
- pStr
];
684 return TOKEN_ILLEGAL
;
// Returns the flags of c as a user-defined continuation character.
// c is searched in aContChars via StrChr(); the offset of the hit within
// the string is used as the index into pCont. The other visible result is
// TOKEN_ILLEGAL.
// NOTE(review): the conditions guarding the lookup are not visible here --
// presumably pCont and the StrChr() result are checked for null; confirm.
688 UPT_FLAG_TYPE
cclass_Unicode::getContCharsFlags( sal_Unicode c
)
692 const sal_Unicode
* pStr
= aContChars
.getStr();
693 const sal_Unicode
* p
= StrChr( pStr
, c
);
695 return pCont
[ p
- pStr
];
697 return TOKEN_ILLEGAL
;
// Scans one token from rText, starting at index nPos, and fills r.
//
// The scan loop runs until a NUL character is reached or eState becomes
// ssStop. Each character's flag mask (getFlags()) drives the state machine
// that decides the token type; its KParseTokens classification
// (getParseTokensType()) is accumulated into r.StartFlags / r.ContFlags.
// After the loop, r.CharLen and r.EndPos are derived from the scan position,
// numeric tokens get r.Value computed, and quoted tokens get
// r.DequotedNameOrString.
//
// r           result structure to fill
// rText       text to scan
// nPos        index in rText at which scanning starts
// nTokenType  bit mask of KParseType values; r.TokenType is tested against it
701 void cclass_Unicode::parseText( ParseResult
& r
, const OUString
& rText
, sal_Int32 nPos
, sal_Int32 nTokenType
)
703 using namespace i18n
;
// All pointer arithmetic below is relative to the start position.
704 const sal_Unicode
* const pTextStart
= rText
.getStr() + nPos
;
707 //! All the variables below (plus ParseResult) have to be reset on ssRewindFromValue!
708 const sal_Unicode
* pSym
= pTextStart
;
709 const sal_Unicode
* pSrc
= pSym
;
711 sal_Unicode c
= *pSrc
;
712 sal_Unicode cLast
= 0;
715 bool bMightBeWord
= true;
716 bool bMightBeWordLast
= true;
717 //! All the variables above (plus ParseResult) have to be reset on ssRewindFromValue!
719 while ( (c
!= 0) && (eState
!= ssStop
) )
721 UPT_FLAG_TYPE nMask
= getFlags( pTextStart
, pSrc
- pTextStart
);
722 if ( nMask
& TOKEN_EXCLUDED
)
725 { // only relevant for ssGetValue fall back
726 if ( eState
== ssGetChar
|| eState
== ssRewindFromValue
||
727 eState
== ssIgnoreLeadingInRewind
)
728 bMightBeWord
= ((nMask
& TOKEN_CHAR_WORD
) != 0);
730 bMightBeWord
= ((nMask
& TOKEN_WORD
) != 0);
732 sal_Int32 nParseTokensType
= getParseTokensType( pTextStart
, pSrc
- pTextStart
);
// First character of a token: decide which kind of token starts here.
737 case ssRewindFromValue
:
738 case ssIgnoreLeadingInRewind
:
740 if ( (nMask
& TOKEN_CHAR_VALUE
) && eState
!= ssRewindFromValue
741 && eState
!= ssIgnoreLeadingInRewind
)
742 { //! must be first, may fall back to ssGetWord via bMightBeWord
744 if ( nMask
& TOKEN_VALUE_DIGIT
)
747 r
.TokenType
= KParseType::UNI_NUMBER
;
749 r
.TokenType
= KParseType::ASC_NUMBER
;
751 else if ( c
== cDecimalSep
)
756 eState
= ssRewindFromValue
;
757 // retry for ONE_SINGLE_CHAR or others
760 else if ( nMask
& TOKEN_CHAR_WORD
)
763 r
.TokenType
= KParseType::IDENTNAME
;
765 else if ( nMask
& TOKEN_NAME_SEP
)
767 eState
= ssGetWordFirstChar
;
770 nParseTokensType
= 0; // will be taken of first real character
771 r
.TokenType
= KParseType::SINGLE_QUOTE_NAME
;
773 else if ( nMask
& TOKEN_CHAR_STRING
)
775 eState
= ssGetString
;
777 nParseTokensType
= 0; // will be taken of first real character
778 r
.TokenType
= KParseType::DOUBLE_QUOTE_STRING
;
780 else if ( nMask
& TOKEN_CHAR_DONTCARE
)
782 if ( nStartTypes
& KParseTokens::IGNORE_LEADING_WS
)
784 if (eState
== ssRewindFromValue
)
785 eState
= ssIgnoreLeadingInRewind
;
786 r
.LeadingWhiteSpace
++;
788 nParseTokensType
= 0; // wait until real character
794 else if ( nMask
& TOKEN_CHAR_BOOL
)
797 r
.TokenType
= KParseType::BOOLEAN
;
799 else if ( nMask
& TOKEN_CHAR
)
802 r
.TokenType
= KParseType::ONE_SINGLE_CHAR
;
805 eState
= ssBounce
; // not known
// Inside a numeric value.
810 if ( nMask
& TOKEN_VALUE_DIGIT
)
813 r
.TokenType
= KParseType::UNI_NUMBER
;
814 else if ( r
.TokenType
!= KParseType::UNI_NUMBER
)
815 r
.TokenType
= KParseType::ASC_NUMBER
;
817 if ( nMask
& TOKEN_VALUE
)
819 if ( c
== cDecimalSep
&& ++nDecSeps
> 1 )
821 if ( pSrc
- pTextStart
== 2 )
822 eState
= ssRewindFromValue
;
823 // consecutive separators
827 // else keep it going
829 else if ( c
== 'E' || c
== 'e' )
831 UPT_FLAG_TYPE nNext
= getFlags( pTextStart
, pSrc
- pTextStart
);
832 if ( nNext
& TOKEN_VALUE_EXP
)
834 else if ( bMightBeWord
&& ((nNext
& TOKEN_WORD
) || !*pSrc
) )
835 { // might be a numerical name (1.2efg)
837 r
.TokenType
= KParseType::IDENTNAME
;
842 else if ( nMask
& TOKEN_VALUE_SIGN
)
844 if ( (cLast
== 'E') || (cLast
== 'e') )
846 UPT_FLAG_TYPE nNext
= getFlags( pTextStart
, pSrc
- pTextStart
);
847 if ( nNext
& TOKEN_VALUE_EXP_VALUE
)
849 else if ( bMightBeWord
&& ((nNext
& TOKEN_WORD
) || !*pSrc
) )
850 { // might be a numerical name (1.2e+fg)
852 r
.TokenType
= KParseType::IDENTNAME
;
857 else if ( bMightBeWord
)
858 { // might be a numerical name (1.2+fg)
860 r
.TokenType
= KParseType::IDENTNAME
;
865 else if ( bMightBeWord
&& (nMask
& TOKEN_WORD
) )
866 { // might be a numerical name (1995.A1)
868 r
.TokenType
= KParseType::IDENTNAME
;
// Inside a (possibly single-quoted) name.
874 case ssGetWordFirstChar
:
879 if ( nMask
& TOKEN_WORD
)
881 else if ( nMask
& TOKEN_NAME_SEP
)
887 aSymbol
+= OUString( pSym
, pSrc
- pSym
- 2 );
888 aSymbol
+= OUString( &c
, 1);
893 aSymbol
+= OUString( pSym
, pSrc
- pSym
- 1 );
// Inside a double-quoted string.
908 if ( nMask
& TOKEN_STRING_SEP
)
912 aSymbol
+= OUString( pSym
, pSrc
- pSym
- 2 );
913 aSymbol
+= OUString( &c
, 1);
915 else if ( c
== *pSrc
&&
916 !(nContTypes
& KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING
) )
917 { // "" => literal " escaped
918 aSymbol
+= OUString( pSym
, pSrc
- pSym
);
924 aSymbol
+= OUString( pSym
, pSrc
- pSym
- 1 );
932 if ( (nMask
& TOKEN_BOOL
) )
933 eState
= ssStop
; // maximum 2: <, >, <>, <=, >=
941 ; // nothing, no compiler warning
// A value scan that has to be retried as something else restarts here.
944 if ( eState
== ssRewindFromValue
)
949 aSymbol
= OUString();
955 bMightBeWordLast
= true;
// The token type found so far is not among the requested types.
959 if ( !(r
.TokenType
& nTokenType
) )
961 if ( (r
.TokenType
& (KParseType::ASC_NUMBER
| KParseType::UNI_NUMBER
))
962 && (nTokenType
& KParseType::IDENTNAME
) && bMightBeWord
)
963 ; // keep a number that might be a word
964 else if ( r
.LeadingWhiteSpace
== (pSrc
- pTextStart
) )
965 ; // keep ignored white space
966 else if ( !r
.TokenType
&& eState
== ssGetValue
&& (nMask
& TOKEN_VALUE_SEP
) )
967 ; // keep uncertain value
971 if ( eState
== ssBounce
)
976 if ( eState
== ssStopBack
)
979 bMightBeWord
= bMightBeWordLast
;
982 if ( eState
!= ssStop
)
985 r
.StartFlags
|= nParseTokensType
;
987 r
.ContFlags
|= nParseTokensType
;
989 bMightBeWordLast
= bMightBeWord
;
994 // r.CharLen is the length in characters (not code points) of the parsed
995 // token not including any leading white space, change this calculation if
996 // multi-code-point Unicode characters are to be supported.
997 r
.CharLen
= pSrc
- pTextStart
- r
.LeadingWhiteSpace
;
998 r
.EndPos
= nPos
+ (pSrc
- pTextStart
);
999 if ( r
.TokenType
& KParseType::ASC_NUMBER
)
// The token ends at pSrc. r.EndPos is an index into rText and already
// contains nPos, just like pTextStart does, so adding the two would place
// the end pointer nPos characters behind the token.
1001 r
.Value
= rtl_math_uStringToDouble( pTextStart
+ r
.LeadingWhiteSpace
,
1002 pSrc
, cDecimalSep
, cGroupSep
, NULL
, NULL
);
1004 r
.TokenType
|= KParseType::IDENTNAME
;
1006 else if ( r
.TokenType
& KParseType::UNI_NUMBER
)
1008 if ( !xNatNumSup
.is() )
1010 if ( m_xContext
.is() )
1012 xNatNumSup
= NativeNumberSupplier::create( m_xContext
);
// The start pointer already skips the leading white space, so the length
// must exclude it as well (the result equals r.CharLen).
1015 OUString
aTmp( pTextStart
+ r
.LeadingWhiteSpace
, r
.EndPos
- nPos
-
1016 r
.LeadingWhiteSpace
);
1017 // transliterate to ASCII
1018 aTmp
= xNatNumSup
->getNativeNumberString( aTmp
, aParserLocale
,
1019 NativeNumberMode::NATNUM0
);
1020 r
.Value
= ::rtl::math::stringToDouble( aTmp
, cDecimalSep
, cGroupSep
, NULL
, NULL
);
1022 r
.TokenType
|= KParseType::IDENTNAME
;
1024 else if ( r
.TokenType
& (KParseType::SINGLE_QUOTE_NAME
| KParseType::DOUBLE_QUOTE_STRING
) )
// Text collected after the last copy point belongs to an unterminated
// quoted token.
1028 aSymbol
+= OUString( pSym
, pSrc
- pSym
);
1029 r
.TokenType
|= KParseType::MISSING_QUOTE
;
1031 r
.DequotedNameOrString
= aSymbol
;
1037 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */