1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <cclass_unicode.hxx>
22 #include <unicode/uchar.h>
23 #include <rtl/math.hxx>
24 #include <rtl/ustring.hxx>
25 #include <com/sun/star/i18n/KParseTokens.hpp>
26 #include <com/sun/star/i18n/KParseType.hpp>
27 #include <com/sun/star/i18n/UnicodeType.hpp>
28 #include <com/sun/star/i18n/LocaleData.hpp>
29 #include <com/sun/star/i18n/NativeNumberMode.hpp>
30 #include <com/sun/star/i18n/NativeNumberSupplier.hpp>
31 #include <comphelper/processfactory.hxx>
35 using namespace ::com::sun::star::uno
;
36 using namespace ::com::sun::star::lang
;
38 namespace com
{ namespace sun
{ namespace star
{ namespace i18n
{
40 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_ILLEGAL
= 0x00000000;
41 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR
= 0x00000001;
42 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR_BOOL
= 0x00000002;
43 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR_WORD
= 0x00000004;
44 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR_VALUE
= 0x00000008;
45 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR_STRING
= 0x00000010;
46 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_CHAR_DONTCARE
= 0x00000020;
47 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_BOOL
= 0x00000040;
48 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_WORD
= 0x00000080;
49 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_WORD_SEP
= 0x00000100;
50 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE
= 0x00000200;
51 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE_SEP
= 0x00000400;
52 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE_EXP
= 0x00000800;
53 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE_SIGN
= 0x00001000;
54 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE_EXP_VALUE
= 0x00002000;
55 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_VALUE_DIGIT
= 0x00004000;
56 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_NAME_SEP
= 0x20000000;
57 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_STRING_SEP
= 0x40000000;
58 const UPT_FLAG_TYPE
cclass_Unicode::TOKEN_EXCLUDED
= 0x80000000;
60 #define TOKEN_DIGIT_FLAGS (TOKEN_CHAR_VALUE | TOKEN_VALUE | TOKEN_VALUE_EXP | TOKEN_VALUE_EXP_VALUE | TOKEN_VALUE_DIGIT)
62 // Default identifier/name specification is [A-Za-z_][A-Za-z0-9_]*
64 const sal_uInt8
cclass_Unicode::nDefCnt
= 128;
65 const UPT_FLAG_TYPE
cclass_Unicode::pDefaultParserTable
[ nDefCnt
] =
67 // (...) == Calc formula compiler specific, commented out and modified
69 /* \0 */ TOKEN_EXCLUDED
,
78 /* 9 \t */ TOKEN_CHAR_DONTCARE
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL)
80 /* 11 \v */ TOKEN_CHAR_DONTCARE
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL)
101 /* 32 */ TOKEN_CHAR_DONTCARE
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
102 /* 33 ! */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
103 /* 34 " */ TOKEN_CHAR_STRING
| TOKEN_STRING_SEP
,
104 /* 35 # */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_WORD_SEP)
105 /* 36 $ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_CHAR_WORD | TOKEN_WORD)
106 /* 37 % */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_VALUE)
107 /* 38 & */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
108 /* 39 ' */ TOKEN_NAME_SEP
,
109 /* 40 ( */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
110 /* 41 ) */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
111 /* 42 * */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
112 /* 43 + */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
| TOKEN_VALUE_EXP
| TOKEN_VALUE_SIGN
,
113 /* 44 , */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_CHAR_VALUE | TOKEN_VALUE)
114 /* 45 - */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
| TOKEN_VALUE_EXP
| TOKEN_VALUE_SIGN
,
115 /* 46 . */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_WORD | TOKEN_CHAR_VALUE | TOKEN_VALUE)
116 /* 47 / */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
117 //for ( i = 48; i < 58; i++ )
118 /* 48 0 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
119 /* 49 1 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
120 /* 50 2 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
121 /* 51 3 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
122 /* 52 4 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
123 /* 53 5 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
124 /* 54 6 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
125 /* 55 7 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
126 /* 56 8 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
127 /* 57 9 */ TOKEN_DIGIT_FLAGS
| TOKEN_WORD
,
128 /* 58 : */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_WORD)
129 /* 59 ; */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
130 /* 60 < */ TOKEN_CHAR_BOOL
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
131 /* 61 = */ TOKEN_CHAR
| TOKEN_BOOL
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
132 /* 62 > */ TOKEN_CHAR_BOOL
| TOKEN_BOOL
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
133 /* 63 ? */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_CHAR_WORD | TOKEN_WORD)
134 /* 64 @ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
135 //for ( i = 65; i < 91; i++ )
136 /* 65 A */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
137 /* 66 B */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
138 /* 67 C */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
139 /* 68 D */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
140 /* 69 E */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
141 /* 70 F */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
142 /* 71 G */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
143 /* 72 H */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
144 /* 73 I */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
145 /* 74 J */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
146 /* 75 K */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
147 /* 76 L */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
148 /* 77 M */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
149 /* 78 N */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
150 /* 79 O */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
151 /* 80 P */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
152 /* 81 Q */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
153 /* 82 R */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
154 /* 83 S */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
155 /* 84 T */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
156 /* 85 U */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
157 /* 86 V */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
158 /* 87 W */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
159 /* 88 X */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
160 /* 89 Y */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
161 /* 90 Z */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
162 /* 91 [ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
163 /* 92 \ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
164 /* 93 ] */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
165 /* 94 ^ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
,
166 /* 95 _ */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
167 /* 96 ` */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
168 //for ( i = 97; i < 123; i++ )
169 /* 97 a */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
170 /* 98 b */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
171 /* 99 c */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
172 /* 100 d */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
173 /* 101 e */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
174 /* 102 f */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
175 /* 103 g */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
176 /* 104 h */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
177 /* 105 i */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
178 /* 106 j */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
179 /* 107 k */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
180 /* 108 l */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
181 /* 109 m */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
182 /* 110 n */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
183 /* 111 o */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
184 /* 112 p */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
185 /* 113 q */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
186 /* 114 r */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
187 /* 115 s */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
188 /* 116 t */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
189 /* 117 u */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
190 /* 118 v */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
191 /* 119 w */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
192 /* 120 x */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
193 /* 121 y */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
194 /* 122 z */ TOKEN_CHAR_WORD
| TOKEN_WORD
,
195 /* 123 { */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
196 /* 124 | */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
197 /* 125 } */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
198 /* 126 ~ */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
, // (TOKEN_ILLEGAL // UNUSED)
199 /* 127 */ TOKEN_CHAR
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
// (TOKEN_ILLEGAL // UNUSED)
203 const sal_Int32
cclass_Unicode::pParseTokensType
[ nDefCnt
] =
205 /* \0 */ KParseTokens::ASC_OTHER
,
206 KParseTokens::ASC_CONTROL
,
207 KParseTokens::ASC_CONTROL
,
208 KParseTokens::ASC_CONTROL
,
209 KParseTokens::ASC_CONTROL
,
210 KParseTokens::ASC_CONTROL
,
211 KParseTokens::ASC_CONTROL
,
212 KParseTokens::ASC_CONTROL
,
213 KParseTokens::ASC_CONTROL
,
214 /* 9 \t */ KParseTokens::ASC_CONTROL
,
215 KParseTokens::ASC_CONTROL
,
216 /* 11 \v */ KParseTokens::ASC_CONTROL
,
217 KParseTokens::ASC_CONTROL
,
218 KParseTokens::ASC_CONTROL
,
219 KParseTokens::ASC_CONTROL
,
220 KParseTokens::ASC_CONTROL
,
221 KParseTokens::ASC_CONTROL
,
222 KParseTokens::ASC_CONTROL
,
223 KParseTokens::ASC_CONTROL
,
224 KParseTokens::ASC_CONTROL
,
225 KParseTokens::ASC_CONTROL
,
226 KParseTokens::ASC_CONTROL
,
227 KParseTokens::ASC_CONTROL
,
228 KParseTokens::ASC_CONTROL
,
229 KParseTokens::ASC_CONTROL
,
230 KParseTokens::ASC_CONTROL
,
231 KParseTokens::ASC_CONTROL
,
232 KParseTokens::ASC_CONTROL
,
233 KParseTokens::ASC_CONTROL
,
234 KParseTokens::ASC_CONTROL
,
235 KParseTokens::ASC_CONTROL
,
236 KParseTokens::ASC_CONTROL
,
237 /* 32 */ KParseTokens::ASC_OTHER
,
238 /* 33 ! */ KParseTokens::ASC_OTHER
,
239 /* 34 " */ KParseTokens::ASC_OTHER
,
240 /* 35 # */ KParseTokens::ASC_OTHER
,
241 /* 36 $ */ KParseTokens::ASC_DOLLAR
,
242 /* 37 % */ KParseTokens::ASC_OTHER
,
243 /* 38 & */ KParseTokens::ASC_OTHER
,
244 /* 39 ' */ KParseTokens::ASC_OTHER
,
245 /* 40 ( */ KParseTokens::ASC_OTHER
,
246 /* 41 ) */ KParseTokens::ASC_OTHER
,
247 /* 42 * */ KParseTokens::ASC_OTHER
,
248 /* 43 + */ KParseTokens::ASC_OTHER
,
249 /* 44 , */ KParseTokens::ASC_OTHER
,
250 /* 45 - */ KParseTokens::ASC_OTHER
,
251 /* 46 . */ KParseTokens::ASC_DOT
,
252 /* 47 / */ KParseTokens::ASC_OTHER
,
253 //for ( i = 48; i < 58; i++ )
254 /* 48 0 */ KParseTokens::ASC_DIGIT
,
255 /* 49 1 */ KParseTokens::ASC_DIGIT
,
256 /* 50 2 */ KParseTokens::ASC_DIGIT
,
257 /* 51 3 */ KParseTokens::ASC_DIGIT
,
258 /* 52 4 */ KParseTokens::ASC_DIGIT
,
259 /* 53 5 */ KParseTokens::ASC_DIGIT
,
260 /* 54 6 */ KParseTokens::ASC_DIGIT
,
261 /* 55 7 */ KParseTokens::ASC_DIGIT
,
262 /* 56 8 */ KParseTokens::ASC_DIGIT
,
263 /* 57 9 */ KParseTokens::ASC_DIGIT
,
264 /* 58 : */ KParseTokens::ASC_COLON
,
265 /* 59 ; */ KParseTokens::ASC_OTHER
,
266 /* 60 < */ KParseTokens::ASC_OTHER
,
267 /* 61 = */ KParseTokens::ASC_OTHER
,
268 /* 62 > */ KParseTokens::ASC_OTHER
,
269 /* 63 ? */ KParseTokens::ASC_OTHER
,
270 /* 64 @ */ KParseTokens::ASC_OTHER
,
271 //for ( i = 65; i < 91; i++ )
272 /* 65 A */ KParseTokens::ASC_UPALPHA
,
273 /* 66 B */ KParseTokens::ASC_UPALPHA
,
274 /* 67 C */ KParseTokens::ASC_UPALPHA
,
275 /* 68 D */ KParseTokens::ASC_UPALPHA
,
276 /* 69 E */ KParseTokens::ASC_UPALPHA
,
277 /* 70 F */ KParseTokens::ASC_UPALPHA
,
278 /* 71 G */ KParseTokens::ASC_UPALPHA
,
279 /* 72 H */ KParseTokens::ASC_UPALPHA
,
280 /* 73 I */ KParseTokens::ASC_UPALPHA
,
281 /* 74 J */ KParseTokens::ASC_UPALPHA
,
282 /* 75 K */ KParseTokens::ASC_UPALPHA
,
283 /* 76 L */ KParseTokens::ASC_UPALPHA
,
284 /* 77 M */ KParseTokens::ASC_UPALPHA
,
285 /* 78 N */ KParseTokens::ASC_UPALPHA
,
286 /* 79 O */ KParseTokens::ASC_UPALPHA
,
287 /* 80 P */ KParseTokens::ASC_UPALPHA
,
288 /* 81 Q */ KParseTokens::ASC_UPALPHA
,
289 /* 82 R */ KParseTokens::ASC_UPALPHA
,
290 /* 83 S */ KParseTokens::ASC_UPALPHA
,
291 /* 84 T */ KParseTokens::ASC_UPALPHA
,
292 /* 85 U */ KParseTokens::ASC_UPALPHA
,
293 /* 86 V */ KParseTokens::ASC_UPALPHA
,
294 /* 87 W */ KParseTokens::ASC_UPALPHA
,
295 /* 88 X */ KParseTokens::ASC_UPALPHA
,
296 /* 89 Y */ KParseTokens::ASC_UPALPHA
,
297 /* 90 Z */ KParseTokens::ASC_UPALPHA
,
298 /* 91 [ */ KParseTokens::ASC_OTHER
,
299 /* 92 \ */ KParseTokens::ASC_OTHER
,
300 /* 93 ] */ KParseTokens::ASC_OTHER
,
301 /* 94 ^ */ KParseTokens::ASC_OTHER
,
302 /* 95 _ */ KParseTokens::ASC_UNDERSCORE
,
303 /* 96 ` */ KParseTokens::ASC_OTHER
,
304 //for ( i = 97; i < 123; i++ )
305 /* 97 a */ KParseTokens::ASC_LOALPHA
,
306 /* 98 b */ KParseTokens::ASC_LOALPHA
,
307 /* 99 c */ KParseTokens::ASC_LOALPHA
,
308 /* 100 d */ KParseTokens::ASC_LOALPHA
,
309 /* 101 e */ KParseTokens::ASC_LOALPHA
,
310 /* 102 f */ KParseTokens::ASC_LOALPHA
,
311 /* 103 g */ KParseTokens::ASC_LOALPHA
,
312 /* 104 h */ KParseTokens::ASC_LOALPHA
,
313 /* 105 i */ KParseTokens::ASC_LOALPHA
,
314 /* 106 j */ KParseTokens::ASC_LOALPHA
,
315 /* 107 k */ KParseTokens::ASC_LOALPHA
,
316 /* 108 l */ KParseTokens::ASC_LOALPHA
,
317 /* 109 m */ KParseTokens::ASC_LOALPHA
,
318 /* 110 n */ KParseTokens::ASC_LOALPHA
,
319 /* 111 o */ KParseTokens::ASC_LOALPHA
,
320 /* 112 p */ KParseTokens::ASC_LOALPHA
,
321 /* 113 q */ KParseTokens::ASC_LOALPHA
,
322 /* 114 r */ KParseTokens::ASC_LOALPHA
,
323 /* 115 s */ KParseTokens::ASC_LOALPHA
,
324 /* 116 t */ KParseTokens::ASC_LOALPHA
,
325 /* 117 u */ KParseTokens::ASC_LOALPHA
,
326 /* 118 v */ KParseTokens::ASC_LOALPHA
,
327 /* 119 w */ KParseTokens::ASC_LOALPHA
,
328 /* 120 x */ KParseTokens::ASC_LOALPHA
,
329 /* 121 y */ KParseTokens::ASC_LOALPHA
,
330 /* 122 z */ KParseTokens::ASC_LOALPHA
,
331 /* 123 { */ KParseTokens::ASC_OTHER
,
332 /* 124 | */ KParseTokens::ASC_OTHER
,
333 /* 125 } */ KParseTokens::ASC_OTHER
,
334 /* 126 ~ */ KParseTokens::ASC_OTHER
,
335 /* 127 */ KParseTokens::ASC_OTHER
340 const sal_Unicode
* cclass_Unicode::StrChr( const sal_Unicode
* pStr
, sal_Unicode c
)
354 sal_Int32
cclass_Unicode::getParseTokensType( const sal_Unicode
* aStr
, sal_Int32 nPos
)
356 sal_Unicode c
= aStr
[nPos
];
358 return pParseTokensType
[ sal_uInt8(c
) ];
362 //! all KParseTokens::UNI_... must be matched
363 switch ( u_charType( (sal_uInt32
) c
) )
365 case U_UPPERCASE_LETTER
:
366 return KParseTokens::UNI_UPALPHA
;
367 case U_LOWERCASE_LETTER
:
368 return KParseTokens::UNI_LOALPHA
;
369 case U_TITLECASE_LETTER
:
370 return KParseTokens::UNI_TITLE_ALPHA
;
371 case U_MODIFIER_LETTER
:
372 return KParseTokens::UNI_MODIFIER_LETTER
;
373 case U_OTHER_LETTER
:
374 // Non_Spacing_Mark could not be as leading character
375 if (nPos
== 0) break;
376 // fall through, treat it as Other_Letter.
377 case U_NON_SPACING_MARK
:
378 return KParseTokens::UNI_OTHER_LETTER
;
379 case U_DECIMAL_DIGIT_NUMBER
:
380 return KParseTokens::UNI_DIGIT
;
381 case U_LETTER_NUMBER
:
382 return KParseTokens::UNI_LETTER_NUMBER
;
383 case U_OTHER_NUMBER
:
384 return KParseTokens::UNI_OTHER_NUMBER
;
387 return KParseTokens::UNI_OTHER
;
391 bool cclass_Unicode::setupInternational( const Locale
& rLocale
)
393 bool bChanged
= (aParserLocale
.Language
!= rLocale
.Language
394 || aParserLocale
.Country
!= rLocale
.Country
395 || aParserLocale
.Variant
!= rLocale
.Variant
);
398 aParserLocale
.Language
= rLocale
.Language
;
399 aParserLocale
.Country
= rLocale
.Country
;
400 aParserLocale
.Variant
= rLocale
.Variant
;
402 if ( !mxLocaleData
.is() )
404 mxLocaleData
.set( LocaleData::create(m_xContext
) );
410 void cclass_Unicode::setupParserTable( const Locale
& rLocale
, sal_Int32 startCharTokenType
,
411 const OUString
& userDefinedCharactersStart
, sal_Int32 contCharTokenType
,
412 const OUString
& userDefinedCharactersCont
)
414 bool bIntlEqual
= (rLocale
.Language
== aParserLocale
.Language
&&
415 rLocale
.Country
== aParserLocale
.Country
&&
416 rLocale
.Variant
== aParserLocale
.Variant
);
417 if ( !pTable
|| !bIntlEqual
||
418 startCharTokenType
!= nStartTypes
||
419 contCharTokenType
!= nContTypes
||
420 userDefinedCharactersStart
!= aStartChars
||
421 userDefinedCharactersCont
!= aContChars
)
422 initParserTable( rLocale
, startCharTokenType
, userDefinedCharactersStart
,
423 contCharTokenType
, userDefinedCharactersCont
);
427 void cclass_Unicode::initParserTable( const Locale
& rLocale
, sal_Int32 startCharTokenType
,
428 const OUString
& userDefinedCharactersStart
, sal_Int32 contCharTokenType
,
429 const OUString
& userDefinedCharactersCont
)
432 setupInternational( rLocale
);
433 // Memory of pTable is reused.
435 pTable
= new UPT_FLAG_TYPE
[nDefCnt
];
436 memcpy( pTable
, pDefaultParserTable
, sizeof(UPT_FLAG_TYPE
) * nDefCnt
);
437 // Start and cont tables only need reallocation if different length.
438 if ( pStart
&& userDefinedCharactersStart
.getLength() != aStartChars
.getLength() )
443 if ( pCont
&& userDefinedCharactersCont
.getLength() != aContChars
.getLength() )
448 nStartTypes
= startCharTokenType
;
449 nContTypes
= contCharTokenType
;
450 aStartChars
= userDefinedCharactersStart
;
451 aContChars
= userDefinedCharactersCont
;
454 if( mxLocaleData
.is() )
456 LocaleDataItem aItem
=
457 mxLocaleData
->getLocaleItem( aParserLocale
);
458 //!TODO: theoretically separators may be a string, adjustment would have to be
459 //! done here and in parsing and in ::rtl::math::stringToDouble()
460 cGroupSep
= aItem
.thousandSeparator
[0];
461 cDecimalSep
= aItem
.decimalSeparator
[0];
464 if ( cGroupSep
< nDefCnt
)
465 pTable
[cGroupSep
] |= TOKEN_VALUE
;
466 if ( cDecimalSep
< nDefCnt
)
467 pTable
[cDecimalSep
] |= TOKEN_CHAR_VALUE
| TOKEN_VALUE
;
469 // Modify characters according to KParseTokens definitions.
471 using namespace KParseTokens
;
474 if ( !(nStartTypes
& ASC_UPALPHA
) )
475 for ( i
= 65; i
< 91; i
++ )
476 pTable
[i
] &= ~TOKEN_CHAR_WORD
; // not allowed as start character
477 if ( !(nContTypes
& ASC_UPALPHA
) )
478 for ( i
= 65; i
< 91; i
++ )
479 pTable
[i
] &= ~TOKEN_WORD
; // not allowed as cont character
481 if ( !(nStartTypes
& ASC_LOALPHA
) )
482 for ( i
= 97; i
< 123; i
++ )
483 pTable
[i
] &= ~TOKEN_CHAR_WORD
; // not allowed as start character
484 if ( !(nContTypes
& ASC_LOALPHA
) )
485 for ( i
= 97; i
< 123; i
++ )
486 pTable
[i
] &= ~TOKEN_WORD
; // not allowed as cont character
488 if ( nStartTypes
& ASC_DIGIT
)
489 for ( i
= 48; i
< 58; i
++ )
490 pTable
[i
] |= TOKEN_CHAR_WORD
; // allowed as start character
491 if ( !(nContTypes
& ASC_DIGIT
) )
492 for ( i
= 48; i
< 58; i
++ )
493 pTable
[i
] &= ~TOKEN_WORD
; // not allowed as cont character
495 if ( !(nStartTypes
& ASC_UNDERSCORE
) )
496 pTable
[95] &= ~TOKEN_CHAR_WORD
; // not allowed as start character
497 if ( !(nContTypes
& ASC_UNDERSCORE
) )
498 pTable
[95] &= ~TOKEN_WORD
; // not allowed as cont character
500 if ( nStartTypes
& ASC_DOLLAR
)
501 pTable
[36] |= TOKEN_CHAR_WORD
; // allowed as start character
502 if ( nContTypes
& ASC_DOLLAR
)
503 pTable
[36] |= TOKEN_WORD
; // allowed as cont character
505 if ( nStartTypes
& ASC_DOT
)
506 pTable
[46] |= TOKEN_CHAR_WORD
; // allowed as start character
507 if ( nContTypes
& ASC_DOT
)
508 pTable
[46] |= TOKEN_WORD
; // allowed as cont character
510 if ( nStartTypes
& ASC_COLON
)
511 pTable
[58] |= TOKEN_CHAR_WORD
; // allowed as start character
512 if ( nContTypes
& ASC_COLON
)
513 pTable
[58] |= TOKEN_WORD
; // allowed as cont character
515 if ( nStartTypes
& ASC_CONTROL
)
516 for ( i
= 1; i
< 32; i
++ )
517 pTable
[i
] |= TOKEN_CHAR_WORD
; // allowed as start character
518 if ( nContTypes
& ASC_CONTROL
)
519 for ( i
= 1; i
< 32; i
++ )
520 pTable
[i
] |= TOKEN_WORD
; // allowed as cont character
522 if ( nStartTypes
& ASC_ANY_BUT_CONTROL
)
523 for ( i
= 32; i
< nDefCnt
; i
++ )
524 pTable
[i
] |= TOKEN_CHAR_WORD
; // allowed as start character
525 if ( nContTypes
& ASC_ANY_BUT_CONTROL
)
526 for ( i
= 32; i
< nDefCnt
; i
++ )
527 pTable
[i
] |= TOKEN_WORD
; // allowed as cont character
531 // Merge in (positively override with) user defined characters.
533 sal_Int32 nLen
= aStartChars
.getLength();
537 pStart
= new UPT_FLAG_TYPE
[ nLen
];
538 const sal_Unicode
* p
= aStartChars
.getStr();
539 for ( sal_Int32 j
=0; j
<nLen
; j
++, p
++ )
541 pStart
[j
] = TOKEN_CHAR_WORD
;
543 pTable
[*p
] |= TOKEN_CHAR_WORD
;
547 nLen
= aContChars
.getLength();
551 pCont
= new UPT_FLAG_TYPE
[ nLen
];
552 const sal_Unicode
* p
= aContChars
.getStr();
553 for ( sal_Int32 j
=0; j
<nLen
; j
++ )
555 pCont
[j
] = TOKEN_WORD
;
557 pTable
[*p
] |= TOKEN_WORD
;
563 void cclass_Unicode::destroyParserTable()
// getFlags: return the parser flags of the character at aStr[nPos] for the
// current parser state (eState).
// The base flags come either from pTable (indexed by the character narrowed to
// sal_uInt8) or from getFlagsExtended(); the condition choosing between the
// two is not part of this excerpt - presumably c < nDefCnt, TODO confirm.
// Depending on eState the user defined start or continuation characters are
// then merged in, see the notes below.
574 UPT_FLAG_TYPE
cclass_Unicode::getFlags( const sal_Unicode
* aStr
, sal_Int32 nPos
)
577 sal_Unicode c
= aStr
[nPos
];
579 nMask
= pTable
[ sal_uInt8(c
) ];
581 nMask
= getFlagsExtended( aStr
, nPos
);
// States that look at the start of a token: if the character is not yet a
// valid word start, consult the user defined start characters.
585 case ssRewindFromValue
:
586 case ssIgnoreLeadingInRewind
:
587 case ssGetWordFirstChar
:
588 if ( !(nMask
& TOKEN_CHAR_WORD
) )
590 nMask
|= getStartCharsFlags( c
);
// A user defined start character overrides an exclusion.
591 if ( nMask
& TOKEN_CHAR_WORD
)
592 nMask
&= ~TOKEN_EXCLUDED
;
// Same scheme for continuation characters (the case labels of this branch are
// not part of this excerpt).
597 if ( !(nMask
& TOKEN_WORD
) )
599 nMask
|= getContCharsFlags( c
);
// A user defined continuation character overrides an exclusion.
600 if ( nMask
& TOKEN_WORD
)
601 nMask
&= ~TOKEN_EXCLUDED
;
605 ; // other cases aren't needed, no compiler warning
// getFlagsExtended: compute the parser flags of the character at aStr[nPos]
// from its Unicode general category (ICU u_charType) instead of a table
// lookup; getFlags() uses it for characters it does not take from pTable.
// The locale's group and decimal separators are tested first; the decimal
// separator yields TOKEN_CHAR_VALUE | TOKEN_VALUE.
// For everything else eState decides whether the character is judged as start
// character (against nStartTypes, giving TOKEN_CHAR_WORD) or as continuation
// character (against nContTypes, giving TOKEN_WORD).
611 UPT_FLAG_TYPE
cclass_Unicode::getFlagsExtended( const sal_Unicode
* aStr
, sal_Int32 nPos
)
613 sal_Unicode c
= aStr
[nPos
];
614 if ( c
== cGroupSep
)
616 else if ( c
== cDecimalSep
)
617 return TOKEN_CHAR_VALUE
| TOKEN_VALUE
;
618 using namespace i18n
;
// bStart: the parser is currently looking at the first character of a token.
619 bool bStart
= (eState
== ssGetChar
|| eState
== ssGetWordFirstChar
||
620 eState
== ssRewindFromValue
|| eState
== ssIgnoreLeadingInRewind
);
// Type mask that applies to this position.
621 sal_Int32 nTypes
= (bStart
? nStartTypes
: nContTypes
);
623 //! all KParseTokens::UNI_... must be matched
624 switch ( u_charType( (sal_uInt32
) c
) )
// Letter categories: word flag if the corresponding UNI_... bit is requested.
626 case U_UPPERCASE_LETTER
:
627 return (nTypes
& KParseTokens::UNI_UPALPHA
) ?
628 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
630 case U_LOWERCASE_LETTER
:
631 return (nTypes
& KParseTokens::UNI_LOALPHA
) ?
632 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
634 case U_TITLECASE_LETTER
:
635 return (nTypes
& KParseTokens::UNI_TITLE_ALPHA
) ?
636 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
638 case U_MODIFIER_LETTER
:
639 return (nTypes
& KParseTokens::UNI_MODIFIER_LETTER
) ?
640 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
642 case U_NON_SPACING_MARK
:
643 case U_COMBINING_SPACING_MARK
:
644 // Non_Spacing_Mark can't be a leading character,
645 // nor can a spacing combining mark.
647 return TOKEN_ILLEGAL
;
648 // fall through, treat it as Other_Letter.
649 case U_OTHER_LETTER
:
650 return (nTypes
& KParseTokens::UNI_OTHER_LETTER
) ?
651 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
// Number categories: TOKEN_DIGIT_FLAGS are always set, the word flag only if
// the corresponding UNI_... bit is requested.
653 case U_DECIMAL_DIGIT_NUMBER
:
654 return ((nTypes
& KParseTokens::UNI_DIGIT
) ?
655 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
656 TOKEN_ILLEGAL
) | TOKEN_DIGIT_FLAGS
;
657 case U_LETTER_NUMBER
:
658 return ((nTypes
& KParseTokens::UNI_LETTER_NUMBER
) ?
659 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
660 TOKEN_ILLEGAL
) | TOKEN_DIGIT_FLAGS
;
661 case U_OTHER_NUMBER
:
662 return ((nTypes
& KParseTokens::UNI_OTHER_NUMBER
) ?
663 (bStart
? TOKEN_CHAR_WORD
: TOKEN_WORD
) :
664 TOKEN_ILLEGAL
) | TOKEN_DIGIT_FLAGS
;
// Unicode spaces: "don't care" if leading white space is to be ignored for
// this position, otherwise a word start (bStart) or a separator.
665 case U_SPACE_SEPARATOR
:
666 return ((nTypes
& KParseTokens::IGNORE_LEADING_WS
) ?
667 TOKEN_CHAR_DONTCARE
: (bStart
? TOKEN_CHAR_WORD
: (TOKEN_CHAR_DONTCARE
| TOKEN_WORD_SEP
| TOKEN_VALUE_SEP
) ));
668 case U_OTHER_PUNCTUATION
:
669 // fdo#61754 Let's see (if we are not at the start) if this is midletter
670 // punctuation and allow it in a word if it is, similarly to
671 // U_NON_SPACING_MARK
672 if (bStart
|| U_WB_MIDLETTER
!= u_getIntPropertyValue(c
, UCHAR_WORD_BREAK
))
673 return TOKEN_ILLEGAL
;
676 //allowing it to continue the word
677 return (nTypes
& KParseTokens::UNI_OTHER_LETTER
) ?
678 TOKEN_WORD
: TOKEN_ILLEGAL
;
// Any category not handled above is illegal.
683 return TOKEN_ILLEGAL
;
687 UPT_FLAG_TYPE
cclass_Unicode::getStartCharsFlags( sal_Unicode c
)
691 const sal_Unicode
* pStr
= aStartChars
.getStr();
692 const sal_Unicode
* p
= StrChr( pStr
, c
);
694 return pStart
[ p
- pStr
];
696 return TOKEN_ILLEGAL
;
700 UPT_FLAG_TYPE
cclass_Unicode::getContCharsFlags( sal_Unicode c
)
704 const sal_Unicode
* pStr
= aContChars
.getStr();
705 const sal_Unicode
* p
= StrChr( pStr
, c
);
707 return pCont
[ p
- pStr
];
709 return TOKEN_ILLEGAL
;
// parseText: scan one token of rText beginning at index nPos and describe it
// in r.
// The scan is a state machine over eState. For every character the flags
// (getFlags) and the KParseTokens class (getParseTokensType) are fetched, the
// state decides how the character extends the token, and scanning ends at the
// terminating 0 or when eState becomes ssStop.
// Fields of r written here: TokenType, LeadingWhiteSpace, StartFlags,
// ContFlags, CharLen, EndPos, Value and DequotedNameOrString.
// nTokenType is the mask of KParseType values the token is checked against
// after each character.
713 void cclass_Unicode::parseText( ParseResult
& r
, const OUString
& rText
, sal_Int32 nPos
, sal_Int32 nTokenType
)
715 using namespace i18n
;
// All positions below are relative to pTextStart, i.e. to rText[nPos].
716 const sal_Unicode
* const pTextStart
= rText
.getStr() + nPos
;
719 //! All the variables below (plus ParseResult) have to be reset on ssRewindFromValue!
// pSym: start of the text not yet copied to aSymbol; pSrc: scan position.
720 const sal_Unicode
* pSym
= pTextStart
;
721 const sal_Unicode
* pSrc
= pSym
;
723 sal_Unicode c
= *pSrc
;
724 sal_Unicode cLast
= 0;
727 bool bMightBeWord
= true;
728 bool bMightBeWordLast
= true;
729 //! All the variables above (plus ParseResult) have to be reset on ssRewindFromValue!
731 while ( (c
!= 0) && (eState
!= ssStop
) )
733 UPT_FLAG_TYPE nMask
= getFlags( pTextStart
, pSrc
- pTextStart
);
734 if ( nMask
& TOKEN_EXCLUDED
)
737 { // only relevant for ssGetValue fall back
738 if ( eState
== ssGetChar
|| eState
== ssRewindFromValue
||
739 eState
== ssIgnoreLeadingInRewind
)
740 bMightBeWord
= ((nMask
& TOKEN_CHAR_WORD
) != 0);
742 bMightBeWord
= ((nMask
& TOKEN_WORD
) != 0);
744 sal_Int32 nParseTokensType
= getParseTokensType( pTextStart
, pSrc
- pTextStart
);
// First character of a token: decide which kind of token starts here.
749 case ssRewindFromValue
:
750 case ssIgnoreLeadingInRewind
:
752 if ( (nMask
& TOKEN_CHAR_VALUE
) && eState
!= ssRewindFromValue
753 && eState
!= ssIgnoreLeadingInRewind
)
754 { //! must be first, may fall back to ssGetWord via bMightBeWord
756 if ( nMask
& TOKEN_VALUE_DIGIT
)
759 r
.TokenType
= KParseType::UNI_NUMBER
;
761 r
.TokenType
= KParseType::ASC_NUMBER
;
763 else if ( c
== cDecimalSep
)
768 eState
= ssRewindFromValue
;
769 // retry for ONE_SINGLE_CHAR or others
772 else if ( nMask
& TOKEN_CHAR_WORD
)
775 r
.TokenType
= KParseType::IDENTNAME
;
777 else if ( nMask
& TOKEN_NAME_SEP
)
779 eState
= ssGetWordFirstChar
;
782 nParseTokensType
= 0; // will be taken of first real character
783 r
.TokenType
= KParseType::SINGLE_QUOTE_NAME
;
785 else if ( nMask
& TOKEN_CHAR_STRING
)
787 eState
= ssGetString
;
789 nParseTokensType
= 0; // will be taken of first real character
790 r
.TokenType
= KParseType::DOUBLE_QUOTE_STRING
;
792 else if ( nMask
& TOKEN_CHAR_DONTCARE
)
794 if ( nStartTypes
& KParseTokens::IGNORE_LEADING_WS
)
796 if (eState
== ssRewindFromValue
)
797 eState
= ssIgnoreLeadingInRewind
;
798 r
.LeadingWhiteSpace
++;
800 nParseTokensType
= 0; // wait until real character
806 else if ( nMask
& TOKEN_CHAR_BOOL
)
809 r
.TokenType
= KParseType::BOOLEAN
;
811 else if ( nMask
& TOKEN_CHAR
)
814 r
.TokenType
= KParseType::ONE_SINGLE_CHAR
;
817 eState
= ssBounce
; // not known
// Inside a numeric value.
822 if ( nMask
& TOKEN_VALUE_DIGIT
)
825 r
.TokenType
= KParseType::UNI_NUMBER
;
826 else if ( r
.TokenType
!= KParseType::UNI_NUMBER
)
827 r
.TokenType
= KParseType::ASC_NUMBER
;
829 if ( nMask
& TOKEN_VALUE
)
831 if ( c
== cDecimalSep
&& ++nDecSeps
> 1 )
833 if ( pSrc
- pTextStart
== 2 )
834 eState
= ssRewindFromValue
;
835 // consecutive separators
839 // else keep it going
841 else if ( c
== 'E' || c
== 'e' )
// NOTE(review): the flags fetched here are named nNext - presumably pSrc has
// already been advanced past c at this point (the increment is not part of
// this excerpt), so this looks at the character following c. Confirm.
843 UPT_FLAG_TYPE nNext
= getFlags( pTextStart
, pSrc
- pTextStart
);
844 if ( nNext
& TOKEN_VALUE_EXP
)
846 else if ( bMightBeWord
&& ((nNext
& TOKEN_WORD
) || !*pSrc
) )
847 { // might be a numerical name (1.2efg)
849 r
.TokenType
= KParseType::IDENTNAME
;
854 else if ( nMask
& TOKEN_VALUE_SIGN
)
// A sign is only part of a value directly after the exponent character.
856 if ( (cLast
== 'E') || (cLast
== 'e') )
858 UPT_FLAG_TYPE nNext
= getFlags( pTextStart
, pSrc
- pTextStart
);
859 if ( nNext
& TOKEN_VALUE_EXP_VALUE
)
861 else if ( bMightBeWord
&& ((nNext
& TOKEN_WORD
) || !*pSrc
) )
862 { // might be a numerical name (1.2e+fg)
864 r
.TokenType
= KParseType::IDENTNAME
;
869 else if ( bMightBeWord
)
870 { // might be a numerical name (1.2+fg)
872 r
.TokenType
= KParseType::IDENTNAME
;
877 else if ( bMightBeWord
&& (nMask
& TOKEN_WORD
) )
878 { // might be a numerical name (1995.A1)
880 r
.TokenType
= KParseType::IDENTNAME
;
// Inside a word / single quoted name; collected text goes to aSymbol.
886 case ssGetWordFirstChar
:
891 if ( nMask
& TOKEN_WORD
)
893 else if ( nMask
& TOKEN_NAME_SEP
)
899 aSymbol
+= OUString( pSym
, pSrc
- pSym
- 2 );
900 aSymbol
+= OUString( &c
, 1);
905 aSymbol
+= OUString( pSym
, pSrc
- pSym
- 1 );
// Inside a double quoted string.
920 if ( nMask
& TOKEN_STRING_SEP
)
924 aSymbol
+= OUString( pSym
, pSrc
- pSym
- 2 );
925 aSymbol
+= OUString( &c
, 1);
927 else if ( c
== *pSrc
&&
928 !(nContTypes
& KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING
) )
929 { // "" => literal " escaped
930 aSymbol
+= OUString( pSym
, pSrc
- pSym
);
936 aSymbol
+= OUString( pSym
, pSrc
- pSym
- 1 );
944 if ( (nMask
& TOKEN_BOOL
) )
945 eState
= ssStop
; // maximum 2: <, >, <>, <=, >=
953 ; // nothing, no compiler warning
956 if ( eState
== ssRewindFromValue
)
967 bMightBeWordLast
= true;
// Token type not among the requested ones: the three empty statements below
// are the cases in which the token is kept nevertheless.
971 if ( !(r
.TokenType
& nTokenType
) )
973 if ( (r
.TokenType
& (KParseType::ASC_NUMBER
| KParseType::UNI_NUMBER
))
974 && (nTokenType
& KParseType::IDENTNAME
) && bMightBeWord
)
975 ; // keep a number that might be a word
976 else if ( r
.LeadingWhiteSpace
== (pSrc
- pTextStart
) )
977 ; // keep ignored white space
978 else if ( !r
.TokenType
&& eState
== ssGetValue
&& (nMask
& TOKEN_VALUE_SEP
) )
979 ; // keep uncertain value
983 if ( eState
== ssBounce
)
988 if ( eState
== ssStopBack
)
991 bMightBeWord
= bMightBeWordLast
;
994 if ( eState
!= ssStop
)
997 r
.StartFlags
|= nParseTokensType
;
999 r
.ContFlags
|= nParseTokensType
;
1001 bMightBeWordLast
= bMightBeWord
;
1006 // r.CharLen is the length in characters (not code points) of the parsed
1007 // token not including any leading white space, change this calculation if
1008 // multi-code-point Unicode characters are to be supported.
1009 r
.CharLen
= pSrc
- pTextStart
- r
.LeadingWhiteSpace
;
// EndPos is an index into rText, hence nPos is added back.
1010 r
.EndPos
= nPos
+ (pSrc
- pTextStart
);
1011 if ( r
.TokenType
& KParseType::ASC_NUMBER
)
// NOTE(review): pTextStart is rText.getStr() + nPos and r.EndPos already
// contains nPos, so the end pointer pTextStart + r.EndPos lies nPos characters
// behind pSrc whenever nPos > 0. pSrc (== rText.getStr() + r.EndPos) looks like
// the intended end of the number - confirm and fix.
1013 r
.Value
= rtl_math_uStringToDouble( pTextStart
+ r
.LeadingWhiteSpace
,
1014 pTextStart
+ r
.EndPos
, cDecimalSep
, cGroupSep
, NULL
, NULL
);
1016 r
.TokenType
|= KParseType::IDENTNAME
;
1018 else if ( r
.TokenType
& KParseType::UNI_NUMBER
)
// The native number supplier is created on first use.
1020 if ( !xNatNumSup
.is() )
1022 if ( m_xContext
.is() )
1024 xNatNumSup
= NativeNumberSupplier::create( m_xContext
);
// NOTE(review): the start pointer already skips r.LeadingWhiteSpace, but the
// length passed is (pSrc - pTextStart) + r.LeadingWhiteSpace, i.e. it extends
// 2 * r.LeadingWhiteSpace characters beyond pSrc. r.CharLen
// (= pSrc - pTextStart - r.LeadingWhiteSpace) looks like the intended length -
// confirm and fix.
1027 OUString
aTmp( pTextStart
+ r
.LeadingWhiteSpace
, r
.EndPos
- nPos
+
1028 r
.LeadingWhiteSpace
);
1029 // transliterate to ASCII
1030 aTmp
= xNatNumSup
->getNativeNumberString( aTmp
, aParserLocale
,
1031 NativeNumberMode::NATNUM0
);
1032 r
.Value
= ::rtl::math::stringToDouble( aTmp
, cDecimalSep
, cGroupSep
, NULL
, NULL
);
1034 r
.TokenType
|= KParseType::IDENTNAME
;
1036 else if ( r
.TokenType
& (KParseType::SINGLE_QUOTE_NAME
| KParseType::DOUBLE_QUOTE_STRING
) )
// Remaining text of a quoted token whose closing quote was not seen.
1040 aSymbol
+= OUString( pSym
, pSrc
- pSym
);
1041 r
.TokenType
|= KParseType::MISSING_QUOTE
;
1043 r
.DequotedNameOrString
= aSymbol
;
1049 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */