Version 6.4.0.0.beta1, tag libreoffice-6.4.0.0.beta1
[LibreOffice.git] / i18npool / source / characterclassification / cclass_unicode_parser.cxx
blob05ac79ce624c4439d1805101bf48ffc89ba30666
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <cclass_unicode.hxx>
22 #include <unicode/uchar.h>
23 #include <rtl/character.hxx>
24 #include <rtl/math.hxx>
25 #include <rtl/ustring.hxx>
26 #include <com/sun/star/i18n/KParseTokens.hpp>
27 #include <com/sun/star/i18n/KParseType.hpp>
28 #include <com/sun/star/i18n/LocaleData2.hpp>
29 #include <com/sun/star/i18n/NativeNumberMode.hpp>
30 #include <com/sun/star/i18n/NativeNumberSupplier.hpp>
32 #include <string.h>
33 #include <string_view>
35 using namespace ::com::sun::star::uno;
36 using namespace ::com::sun::star::i18n;
37 using namespace ::com::sun::star::lang;
// Flag combination shared by digit characters: it is OR-ed into the default
// table entries for ASCII '0'..'9' and into the flags returned by
// getFlagsExtended() for the Unicode number categories.
39 #define TOKEN_DIGIT_FLAGS (ParserFlags::CHAR_VALUE | ParserFlags::VALUE | ParserFlags::VALUE_EXP | ParserFlags::VALUE_EXP_VALUE | ParserFlags::VALUE_DIGIT)
41 namespace i18npool {
43 // Default identifier/name specification is [A-Za-z_][A-Za-z0-9_]*
// Number of entries in the ASCII lookup tables below. Code points smaller
// than nDefCnt are looked up directly in these tables; larger ones are
// classified via ICU (see getParseTokensType() and getFlagsExtended()).
45 const sal_uInt8 cclass_Unicode::nDefCnt = 128;
// Default parser flags for each ASCII code point, indexed by code point.
// initParserTable() copies this table into pTable and then adjusts the copy
// according to locale separators, the start/cont token types and the
// user-defined characters.
46 const ParserFlags cclass_Unicode::pDefaultParserTable[ nDefCnt ] =
48 // (...) == Calc formula compiler specific, commented out and modified
50 /* \0 */ ParserFlags::EXCLUDED,
51 ParserFlags::ILLEGAL,
52 ParserFlags::ILLEGAL,
53 ParserFlags::ILLEGAL,
54 ParserFlags::ILLEGAL,
55 ParserFlags::ILLEGAL,
56 ParserFlags::ILLEGAL,
57 ParserFlags::ILLEGAL,
58 ParserFlags::ILLEGAL,
59 /* 9 \t */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL)
60 ParserFlags::ILLEGAL,
61 /* 11 \v */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL)
62 ParserFlags::ILLEGAL,
63 ParserFlags::ILLEGAL,
64 ParserFlags::ILLEGAL,
65 ParserFlags::ILLEGAL,
66 ParserFlags::ILLEGAL,
67 ParserFlags::ILLEGAL,
68 ParserFlags::ILLEGAL,
69 ParserFlags::ILLEGAL,
70 ParserFlags::ILLEGAL,
71 ParserFlags::ILLEGAL,
72 ParserFlags::ILLEGAL,
73 ParserFlags::ILLEGAL,
74 ParserFlags::ILLEGAL,
75 ParserFlags::ILLEGAL,
76 ParserFlags::ILLEGAL,
77 ParserFlags::ILLEGAL,
78 ParserFlags::ILLEGAL,
79 ParserFlags::ILLEGAL,
80 ParserFlags::ILLEGAL,
81 ParserFlags::ILLEGAL,
82 /* 32 space */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
83 /* 33 ! */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
84 /* 34 " */ ParserFlags::CHAR_STRING | ParserFlags::STRING_SEP,
85 /* 35 # */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD_SEP)
86 /* 36 $ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_WORD | ParserFlags::WORD)
87 /* 37 % */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::VALUE)
88 /* 38 & */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
89 /* 39 ' */ ParserFlags::NAME_SEP,
90 /* 40 ( */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
91 /* 41 ) */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
92 /* 42 * */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
93 /* 43 + */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP | ParserFlags::VALUE_EXP | ParserFlags::VALUE_SIGN,
94 /* 44 , */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_VALUE | ParserFlags::VALUE)
95 /* 45 - */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP | ParserFlags::VALUE_EXP | ParserFlags::VALUE_SIGN,
96 /* 46 . */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD | ParserFlags::CHAR_VALUE | ParserFlags::VALUE)
97 /* 47 / */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
98 //for ( i = 48; i < 58; i++ )
99 /* 48 0 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
100 /* 49 1 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
101 /* 50 2 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
102 /* 51 3 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
103 /* 52 4 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
104 /* 53 5 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
105 /* 54 6 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
106 /* 55 7 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
107 /* 56 8 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
108 /* 57 9 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
109 /* 58 : */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD)
110 /* 59 ; */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
111 /* 60 < */ ParserFlags::CHAR_BOOL | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
112 /* 61 = */ ParserFlags::CHAR | ParserFlags::BOOL | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
113 /* 62 > */ ParserFlags::CHAR_BOOL | ParserFlags::BOOL | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
114 /* 63 ? */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_WORD | ParserFlags::WORD)
115 /* 64 @ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
116 //for ( i = 65; i < 91; i++ )
117 /* 65 A */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
118 /* 66 B */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
119 /* 67 C */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
120 /* 68 D */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
121 /* 69 E */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
122 /* 70 F */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
123 /* 71 G */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
124 /* 72 H */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
125 /* 73 I */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
126 /* 74 J */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
127 /* 75 K */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
128 /* 76 L */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
129 /* 77 M */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
130 /* 78 N */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
131 /* 79 O */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
132 /* 80 P */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
133 /* 81 Q */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
134 /* 82 R */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
135 /* 83 S */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
136 /* 84 T */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
137 /* 85 U */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
138 /* 86 V */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
139 /* 87 W */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
140 /* 88 X */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
141 /* 89 Y */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
142 /* 90 Z */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
143 /* 91 [ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
144 /* 92 \ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
145 /* 93 ] */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
146 /* 94 ^ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
147 /* 95 _ */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
148 /* 96 ` */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
149 //for ( i = 97; i < 123; i++ )
150 /* 97 a */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
151 /* 98 b */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
152 /* 99 c */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
153 /* 100 d */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
154 /* 101 e */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
155 /* 102 f */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
156 /* 103 g */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
157 /* 104 h */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
158 /* 105 i */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
159 /* 106 j */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
160 /* 107 k */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
161 /* 108 l */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
162 /* 109 m */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
163 /* 110 n */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
164 /* 111 o */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
165 /* 112 p */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
166 /* 113 q */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
167 /* 114 r */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
168 /* 115 s */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
169 /* 116 t */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
170 /* 117 u */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
171 /* 118 v */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
172 /* 119 w */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
173 /* 120 x */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
174 /* 121 y */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
175 /* 122 z */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
176 /* 123 { */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
177 /* 124 | */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
178 /* 125 } */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
179 /* 126 ~ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
180 /* 127 DEL */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP // (ParserFlags::ILLEGAL // UNUSED)
// KParseTokens classification for each ASCII code point, indexed by code
// point. getParseTokensType() returns the entry directly for c < nDefCnt.
184 const sal_Int32 cclass_Unicode::pParseTokensType[ nDefCnt ] =
186 /* \0 */ KParseTokens::ASC_OTHER,
187 KParseTokens::ASC_CONTROL,
188 KParseTokens::ASC_CONTROL,
189 KParseTokens::ASC_CONTROL,
190 KParseTokens::ASC_CONTROL,
191 KParseTokens::ASC_CONTROL,
192 KParseTokens::ASC_CONTROL,
193 KParseTokens::ASC_CONTROL,
194 KParseTokens::ASC_CONTROL,
195 /* 9 \t */ KParseTokens::ASC_CONTROL,
196 KParseTokens::ASC_CONTROL,
197 /* 11 \v */ KParseTokens::ASC_CONTROL,
198 KParseTokens::ASC_CONTROL,
199 KParseTokens::ASC_CONTROL,
200 KParseTokens::ASC_CONTROL,
201 KParseTokens::ASC_CONTROL,
202 KParseTokens::ASC_CONTROL,
203 KParseTokens::ASC_CONTROL,
204 KParseTokens::ASC_CONTROL,
205 KParseTokens::ASC_CONTROL,
206 KParseTokens::ASC_CONTROL,
207 KParseTokens::ASC_CONTROL,
208 KParseTokens::ASC_CONTROL,
209 KParseTokens::ASC_CONTROL,
210 KParseTokens::ASC_CONTROL,
211 KParseTokens::ASC_CONTROL,
212 KParseTokens::ASC_CONTROL,
213 KParseTokens::ASC_CONTROL,
214 KParseTokens::ASC_CONTROL,
215 KParseTokens::ASC_CONTROL,
216 KParseTokens::ASC_CONTROL,
217 KParseTokens::ASC_CONTROL,
218 /* 32 space */ KParseTokens::ASC_OTHER,
219 /* 33 ! */ KParseTokens::ASC_OTHER,
220 /* 34 " */ KParseTokens::ASC_OTHER,
221 /* 35 # */ KParseTokens::ASC_OTHER,
222 /* 36 $ */ KParseTokens::ASC_DOLLAR,
223 /* 37 % */ KParseTokens::ASC_OTHER,
224 /* 38 & */ KParseTokens::ASC_OTHER,
225 /* 39 ' */ KParseTokens::ASC_OTHER,
226 /* 40 ( */ KParseTokens::ASC_OTHER,
227 /* 41 ) */ KParseTokens::ASC_OTHER,
228 /* 42 * */ KParseTokens::ASC_OTHER,
229 /* 43 + */ KParseTokens::ASC_OTHER,
230 /* 44 , */ KParseTokens::ASC_OTHER,
231 /* 45 - */ KParseTokens::ASC_OTHER,
232 /* 46 . */ KParseTokens::ASC_DOT,
233 /* 47 / */ KParseTokens::ASC_OTHER,
234 //for ( i = 48; i < 58; i++ )
235 /* 48 0 */ KParseTokens::ASC_DIGIT,
236 /* 49 1 */ KParseTokens::ASC_DIGIT,
237 /* 50 2 */ KParseTokens::ASC_DIGIT,
238 /* 51 3 */ KParseTokens::ASC_DIGIT,
239 /* 52 4 */ KParseTokens::ASC_DIGIT,
240 /* 53 5 */ KParseTokens::ASC_DIGIT,
241 /* 54 6 */ KParseTokens::ASC_DIGIT,
242 /* 55 7 */ KParseTokens::ASC_DIGIT,
243 /* 56 8 */ KParseTokens::ASC_DIGIT,
244 /* 57 9 */ KParseTokens::ASC_DIGIT,
245 /* 58 : */ KParseTokens::ASC_COLON,
246 /* 59 ; */ KParseTokens::ASC_OTHER,
247 /* 60 < */ KParseTokens::ASC_OTHER,
248 /* 61 = */ KParseTokens::ASC_OTHER,
249 /* 62 > */ KParseTokens::ASC_OTHER,
250 /* 63 ? */ KParseTokens::ASC_OTHER,
251 /* 64 @ */ KParseTokens::ASC_OTHER,
252 //for ( i = 65; i < 91; i++ )
253 /* 65 A */ KParseTokens::ASC_UPALPHA,
254 /* 66 B */ KParseTokens::ASC_UPALPHA,
255 /* 67 C */ KParseTokens::ASC_UPALPHA,
256 /* 68 D */ KParseTokens::ASC_UPALPHA,
257 /* 69 E */ KParseTokens::ASC_UPALPHA,
258 /* 70 F */ KParseTokens::ASC_UPALPHA,
259 /* 71 G */ KParseTokens::ASC_UPALPHA,
260 /* 72 H */ KParseTokens::ASC_UPALPHA,
261 /* 73 I */ KParseTokens::ASC_UPALPHA,
262 /* 74 J */ KParseTokens::ASC_UPALPHA,
263 /* 75 K */ KParseTokens::ASC_UPALPHA,
264 /* 76 L */ KParseTokens::ASC_UPALPHA,
265 /* 77 M */ KParseTokens::ASC_UPALPHA,
266 /* 78 N */ KParseTokens::ASC_UPALPHA,
267 /* 79 O */ KParseTokens::ASC_UPALPHA,
268 /* 80 P */ KParseTokens::ASC_UPALPHA,
269 /* 81 Q */ KParseTokens::ASC_UPALPHA,
270 /* 82 R */ KParseTokens::ASC_UPALPHA,
271 /* 83 S */ KParseTokens::ASC_UPALPHA,
272 /* 84 T */ KParseTokens::ASC_UPALPHA,
273 /* 85 U */ KParseTokens::ASC_UPALPHA,
274 /* 86 V */ KParseTokens::ASC_UPALPHA,
275 /* 87 W */ KParseTokens::ASC_UPALPHA,
276 /* 88 X */ KParseTokens::ASC_UPALPHA,
277 /* 89 Y */ KParseTokens::ASC_UPALPHA,
278 /* 90 Z */ KParseTokens::ASC_UPALPHA,
279 /* 91 [ */ KParseTokens::ASC_OTHER,
280 /* 92 \ */ KParseTokens::ASC_OTHER,
281 /* 93 ] */ KParseTokens::ASC_OTHER,
282 /* 94 ^ */ KParseTokens::ASC_OTHER,
283 /* 95 _ */ KParseTokens::ASC_UNDERSCORE,
284 /* 96 ` */ KParseTokens::ASC_OTHER,
285 //for ( i = 97; i < 123; i++ )
286 /* 97 a */ KParseTokens::ASC_LOALPHA,
287 /* 98 b */ KParseTokens::ASC_LOALPHA,
288 /* 99 c */ KParseTokens::ASC_LOALPHA,
289 /* 100 d */ KParseTokens::ASC_LOALPHA,
290 /* 101 e */ KParseTokens::ASC_LOALPHA,
291 /* 102 f */ KParseTokens::ASC_LOALPHA,
292 /* 103 g */ KParseTokens::ASC_LOALPHA,
293 /* 104 h */ KParseTokens::ASC_LOALPHA,
294 /* 105 i */ KParseTokens::ASC_LOALPHA,
295 /* 106 j */ KParseTokens::ASC_LOALPHA,
296 /* 107 k */ KParseTokens::ASC_LOALPHA,
297 /* 108 l */ KParseTokens::ASC_LOALPHA,
298 /* 109 m */ KParseTokens::ASC_LOALPHA,
299 /* 110 n */ KParseTokens::ASC_LOALPHA,
300 /* 111 o */ KParseTokens::ASC_LOALPHA,
301 /* 112 p */ KParseTokens::ASC_LOALPHA,
302 /* 113 q */ KParseTokens::ASC_LOALPHA,
303 /* 114 r */ KParseTokens::ASC_LOALPHA,
304 /* 115 s */ KParseTokens::ASC_LOALPHA,
305 /* 116 t */ KParseTokens::ASC_LOALPHA,
306 /* 117 u */ KParseTokens::ASC_LOALPHA,
307 /* 118 v */ KParseTokens::ASC_LOALPHA,
308 /* 119 w */ KParseTokens::ASC_LOALPHA,
309 /* 120 x */ KParseTokens::ASC_LOALPHA,
310 /* 121 y */ KParseTokens::ASC_LOALPHA,
311 /* 122 z */ KParseTokens::ASC_LOALPHA,
312 /* 123 { */ KParseTokens::ASC_OTHER,
313 /* 124 | */ KParseTokens::ASC_OTHER,
314 /* 125 } */ KParseTokens::ASC_OTHER,
315 /* 126 ~ */ KParseTokens::ASC_OTHER,
316 /* 127 DEL */ KParseTokens::ASC_OTHER
320 // static
321 const sal_Unicode* cclass_Unicode::StrChr( const sal_Unicode* pStr, sal_uInt32 c )
323 if ( !pStr )
324 return nullptr;
325 sal_Unicode cs[2];
326 auto const n = rtl::splitSurrogates(c, cs);
327 while ( *pStr )
329 if ( *pStr == cs[0] && (n == 1 || pStr[1] == cs[1]) )
330 return pStr;
331 pStr++;
333 return nullptr;
337 sal_Int32 cclass_Unicode::getParseTokensType(sal_uInt32 const c, bool const isFirst)
339 if ( c < nDefCnt )
340 return pParseTokensType[ sal_uInt8(c) ];
341 else
344 //! all KParseTokens::UNI_... must be matched
345 switch (u_charType(c))
347 case U_UPPERCASE_LETTER :
348 return KParseTokens::UNI_UPALPHA;
349 case U_LOWERCASE_LETTER :
350 return KParseTokens::UNI_LOALPHA;
351 case U_TITLECASE_LETTER :
352 return KParseTokens::UNI_TITLE_ALPHA;
353 case U_MODIFIER_LETTER :
354 return KParseTokens::UNI_MODIFIER_LETTER;
355 case U_OTHER_LETTER :
356 // Non_Spacing_Mark could not be as leading character
357 if (isFirst) break;
358 [[fallthrough]]; // treat it as Other_Letter.
359 case U_NON_SPACING_MARK :
360 return KParseTokens::UNI_OTHER_LETTER;
361 case U_DECIMAL_DIGIT_NUMBER :
362 return KParseTokens::UNI_DIGIT;
363 case U_LETTER_NUMBER :
364 return KParseTokens::UNI_LETTER_NUMBER;
365 case U_OTHER_NUMBER :
366 return KParseTokens::UNI_OTHER_NUMBER;
369 return KParseTokens::UNI_OTHER;
373 void cclass_Unicode::setupInternational( const Locale& rLocale )
375 bool bChanged = (aParserLocale.Language != rLocale.Language
376 || aParserLocale.Country != rLocale.Country
377 || aParserLocale.Variant != rLocale.Variant);
378 if ( bChanged )
380 aParserLocale.Language = rLocale.Language;
381 aParserLocale.Country = rLocale.Country;
382 aParserLocale.Variant = rLocale.Variant;
384 if ( !mxLocaleData.is() )
386 mxLocaleData.set( LocaleData2::create(m_xContext) );
391 void cclass_Unicode::setupParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
392 const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
393 const OUString& userDefinedCharactersCont )
395 bool bIntlEqual = (rLocale.Language == aParserLocale.Language &&
396 rLocale.Country == aParserLocale.Country &&
397 rLocale.Variant == aParserLocale.Variant);
398 if ( !pTable || !bIntlEqual ||
399 startCharTokenType != nStartTypes ||
400 contCharTokenType != nContTypes ||
401 userDefinedCharactersStart != aStartChars ||
402 userDefinedCharactersCont != aContChars )
403 initParserTable( rLocale, startCharTokenType, userDefinedCharactersStart,
404 contCharTokenType, userDefinedCharactersCont );
408 void cclass_Unicode::initParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
409 const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
410 const OUString& userDefinedCharactersCont )
412 // (Re)Init
413 setupInternational( rLocale );
414 // Memory of pTable is reused.
415 if ( !pTable )
416 pTable.reset(new ParserFlags[nDefCnt]);
417 memcpy( pTable.get(), pDefaultParserTable, sizeof(ParserFlags) * nDefCnt );
418 // Start and cont tables only need reallocation if different length.
419 if ( pStart && userDefinedCharactersStart.getLength() != aStartChars.getLength() )
421 pStart.reset();
423 if ( pCont && userDefinedCharactersCont.getLength() != aContChars.getLength() )
425 pCont.reset();
427 nStartTypes = startCharTokenType;
428 nContTypes = contCharTokenType;
429 aStartChars = userDefinedCharactersStart;
430 aContChars = userDefinedCharactersCont;
432 // specials
433 if( mxLocaleData.is() )
435 LocaleDataItem2 aItem =
436 mxLocaleData->getLocaleItem2( aParserLocale );
437 //!TODO: theoretically separators may be a string, adjustment would have to be
438 //! done here and in parsing and in ::rtl::math::stringToDouble()
439 cGroupSep = aItem.thousandSeparator[0];
440 cDecimalSep = aItem.decimalSeparator[0];
441 cDecimalSepAlt = aItem.decimalSeparatorAlternative.toChar();
444 if (nContTypes & KParseTokens::GROUP_SEPARATOR_IN_NUMBER)
446 if ( cGroupSep < nDefCnt )
447 pTable[cGroupSep] |= ParserFlags::VALUE;
449 else
451 cGroupSep = 0;
453 if ( cDecimalSep < nDefCnt )
454 pTable[cDecimalSep] |= ParserFlags::CHAR_VALUE | ParserFlags::VALUE;
455 if ( cDecimalSepAlt && cDecimalSepAlt < nDefCnt )
456 pTable[cDecimalSepAlt] |= ParserFlags::CHAR_VALUE | ParserFlags::VALUE;
458 // Modify characters according to KParseTokens definitions.
460 using namespace KParseTokens;
461 sal_uInt8 i;
463 if ( !(nStartTypes & ASC_UPALPHA) )
464 for ( i = 65; i < 91; i++ )
465 pTable[i] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
466 if ( !(nContTypes & ASC_UPALPHA) )
467 for ( i = 65; i < 91; i++ )
468 pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
470 if ( !(nStartTypes & ASC_LOALPHA) )
471 for ( i = 97; i < 123; i++ )
472 pTable[i] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
473 if ( !(nContTypes & ASC_LOALPHA) )
474 for ( i = 97; i < 123; i++ )
475 pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
477 if ( nStartTypes & ASC_DIGIT )
478 for ( i = 48; i < 58; i++ )
479 pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
480 if ( !(nContTypes & ASC_DIGIT) )
481 for ( i = 48; i < 58; i++ )
482 pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
484 if ( !(nStartTypes & ASC_UNDERSCORE) )
485 pTable[95] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
486 if ( !(nContTypes & ASC_UNDERSCORE) )
487 pTable[95] &= ~ParserFlags::WORD; // not allowed as cont character
489 if ( nStartTypes & ASC_DOLLAR )
490 pTable[36] |= ParserFlags::CHAR_WORD; // allowed as start character
491 if ( nContTypes & ASC_DOLLAR )
492 pTable[36] |= ParserFlags::WORD; // allowed as cont character
494 if ( nStartTypes & ASC_DOT )
495 pTable[46] |= ParserFlags::CHAR_WORD; // allowed as start character
496 if ( nContTypes & ASC_DOT )
497 pTable[46] |= ParserFlags::WORD; // allowed as cont character
499 if ( nStartTypes & ASC_COLON )
500 pTable[58] |= ParserFlags::CHAR_WORD; // allowed as start character
501 if ( nContTypes & ASC_COLON )
502 pTable[58] |= ParserFlags::WORD; // allowed as cont character
504 if ( nStartTypes & ASC_CONTROL )
505 for ( i = 1; i < 32; i++ )
506 pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
507 if ( nContTypes & ASC_CONTROL )
508 for ( i = 1; i < 32; i++ )
509 pTable[i] |= ParserFlags::WORD; // allowed as cont character
511 if ( nStartTypes & ASC_ANY_BUT_CONTROL )
512 for ( i = 32; i < nDefCnt; i++ )
513 pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
514 if ( nContTypes & ASC_ANY_BUT_CONTROL )
515 for ( i = 32; i < nDefCnt; i++ )
516 pTable[i] |= ParserFlags::WORD; // allowed as cont character
520 // Merge in (positively override with) user defined characters.
521 // StartChars
522 sal_Int32 nLen = aStartChars.getLength();
523 if ( nLen )
525 if ( !pStart )
526 pStart.reset(new ParserFlags[ nLen ]);
527 const sal_Unicode* p = aStartChars.getStr();
528 for ( sal_Int32 j=0; j<nLen; j++, p++ )
530 pStart[j] = ParserFlags::CHAR_WORD;
531 if ( *p < nDefCnt )
532 pTable[*p] |= ParserFlags::CHAR_WORD;
535 // ContChars
536 nLen = aContChars.getLength();
537 if ( nLen )
539 if ( !pCont )
540 pCont.reset(new ParserFlags[ nLen ]);
541 const sal_Unicode* p = aContChars.getStr();
542 for ( sal_Int32 j=0; j<nLen; j++ )
544 pCont[j] = ParserFlags::WORD;
545 if ( *p < nDefCnt )
546 pTable[*p] |= ParserFlags::WORD;
552 void cclass_Unicode::destroyParserTable()
554 pCont.reset();
555 pStart.reset();
556 pTable.reset();
560 ParserFlags cclass_Unicode::getFlags(sal_uInt32 const c)
562 ParserFlags nMask;
563 if ( c < nDefCnt )
564 nMask = pTable[ sal_uInt8(c) ];
565 else
566 nMask = getFlagsExtended(c);
567 switch ( eState )
569 case ssGetChar :
570 case ssRewindFromValue :
571 case ssIgnoreLeadingInRewind :
572 case ssGetWordFirstChar :
573 if ( !(nMask & ParserFlags::CHAR_WORD) )
575 nMask |= getStartCharsFlags( c );
576 if ( nMask & ParserFlags::CHAR_WORD )
577 nMask &= ~ParserFlags::EXCLUDED;
579 break;
580 case ssGetValue :
581 case ssGetWord :
582 if ( !(nMask & ParserFlags::WORD) )
584 nMask |= getContCharsFlags( c );
585 if ( nMask & ParserFlags::WORD )
586 nMask &= ~ParserFlags::EXCLUDED;
588 break;
589 default:
590 ; // other cases aren't needed, no compiler warning
592 return nMask;
596 ParserFlags cclass_Unicode::getFlagsExtended(sal_uInt32 const c)
598 if ( c == cGroupSep )
599 return ParserFlags::VALUE;
600 else if ( c == cDecimalSep )
601 return ParserFlags::CHAR_VALUE | ParserFlags::VALUE;
602 else if ( cDecimalSepAlt && c == cDecimalSepAlt )
603 return ParserFlags::CHAR_VALUE | ParserFlags::VALUE;
604 bool bStart = (eState == ssGetChar || eState == ssGetWordFirstChar ||
605 eState == ssRewindFromValue || eState == ssIgnoreLeadingInRewind);
606 sal_Int32 nTypes = (bStart ? nStartTypes : nContTypes);
608 //! all KParseTokens::UNI_... must be matched
609 switch (u_charType(c))
611 case U_UPPERCASE_LETTER :
612 return (nTypes & KParseTokens::UNI_UPALPHA) ?
613 (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) :
614 ParserFlags::ILLEGAL;
615 case U_LOWERCASE_LETTER :
616 return (nTypes & KParseTokens::UNI_LOALPHA) ?
617 (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) :
618 ParserFlags::ILLEGAL;
619 case U_TITLECASE_LETTER :
620 return (nTypes & KParseTokens::UNI_TITLE_ALPHA) ?
621 (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) :
622 ParserFlags::ILLEGAL;
623 case U_MODIFIER_LETTER :
624 return (nTypes & KParseTokens::UNI_MODIFIER_LETTER) ?
625 (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) :
626 ParserFlags::ILLEGAL;
627 case U_NON_SPACING_MARK :
628 case U_COMBINING_SPACING_MARK :
629 // Non_Spacing_Mark can't be a leading character,
630 // nor can a spacing combining mark.
631 if (bStart)
632 return ParserFlags::ILLEGAL;
633 [[fallthrough]]; // treat it as Other_Letter.
634 case U_OTHER_LETTER :
635 return (nTypes & KParseTokens::UNI_OTHER_LETTER) ?
636 (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) :
637 ParserFlags::ILLEGAL;
638 case U_DECIMAL_DIGIT_NUMBER :
639 return ((nTypes & KParseTokens::UNI_DIGIT) ?
640 (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) :
641 ParserFlags::ILLEGAL) | TOKEN_DIGIT_FLAGS;
642 case U_LETTER_NUMBER :
643 return ((nTypes & KParseTokens::UNI_LETTER_NUMBER) ?
644 (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) :
645 ParserFlags::ILLEGAL) | TOKEN_DIGIT_FLAGS;
646 case U_OTHER_NUMBER :
647 return ((nTypes & KParseTokens::UNI_OTHER_NUMBER) ?
648 (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) :
649 ParserFlags::ILLEGAL) | TOKEN_DIGIT_FLAGS;
650 case U_SPACE_SEPARATOR :
651 return ((nTypes & KParseTokens::IGNORE_LEADING_WS) ?
652 ParserFlags::CHAR_DONTCARE : (bStart ? ParserFlags::CHAR_WORD : (ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP) ));
653 case U_OTHER_PUNCTUATION:
654 // fdo#61754 Lets see (if we not at the start) if this is midletter
655 // punctuation and allow it in a word if it is similarly to
656 // U_NON_SPACING_MARK, for example U+00B7 MIDDLE DOT.
657 // tdf#123575 for U+30FB KATAKANA MIDDLE DOT property is not
658 // U_WB_MIDLETTER but U_WB_KATAKANA instead, explicitly test that
659 // and U+FF65 HALFWIDTH KATAKANA MIDDLE DOT.
660 if (bStart || (U_WB_MIDLETTER != u_getIntPropertyValue(c, UCHAR_WORD_BREAK)
661 && c != 0x30FB && c != 0xFF65))
662 return ParserFlags::ILLEGAL;
663 else
665 //allowing it to continue the word
666 return (nTypes & KParseTokens::UNI_OTHER_LETTER) ?
667 ParserFlags::WORD : ParserFlags::ILLEGAL;
669 break;
672 return ParserFlags::ILLEGAL;
676 ParserFlags cclass_Unicode::getStartCharsFlags( sal_uInt32 c )
678 if ( pStart )
680 const sal_Unicode* pStr = aStartChars.getStr();
681 const sal_Unicode* p = StrChr( pStr, c );
682 if ( p )
683 return pStart[ p - pStr ];
685 return ParserFlags::ILLEGAL;
689 ParserFlags cclass_Unicode::getContCharsFlags( sal_Unicode c )
691 if ( pCont )
693 const sal_Unicode* pStr = aContChars.getStr();
694 const sal_Unicode* p = StrChr( pStr, c );
695 if ( p )
696 return pCont[ p - pStr ];
698 return ParserFlags::ILLEGAL;
702 void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 nPos, sal_Int32 nTokenType )
704 assert(r.LeadingWhiteSpace == 0);
705 eState = ssGetChar;
707 //! All the variables below (plus ParseResult) have to be reset on ssRewindFromValue!
708 OUStringBuffer aSymbol;
709 bool isFirst(true);
710 sal_Int32 index(nPos); // index of next code point after current
711 sal_Int32 postSymbolIndex(index); // index of code point following last quote
712 sal_uInt32 current((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0);
713 sal_uInt32 cLast = 0;
714 sal_Int32 nCodePoints(0);
715 int nDecSeps = 0;
716 bool bQuote = false;
717 bool bMightBeWord = true;
718 bool bMightBeWordLast = true;
719 bool bDecSepAltUsed = false;
720 //! All the variables above (plus ParseResult) have to be reset on ssRewindFromValue!
721 sal_Int32 nextCharIndex(nPos); // == index of nextChar
723 while ((current != 0) && (eState != ssStop))
725 ++nCodePoints;
726 ParserFlags nMask = getFlags(current);
727 if ( nMask & ParserFlags::EXCLUDED )
728 eState = ssBounce;
729 if ( bMightBeWord )
730 { // only relevant for ssGetValue fall back
731 if ( eState == ssGetChar || eState == ssRewindFromValue ||
732 eState == ssIgnoreLeadingInRewind )
733 bMightBeWord = bool(nMask & ParserFlags::CHAR_WORD);
734 else
735 bMightBeWord = bool(nMask & ParserFlags::WORD);
737 sal_Int32 nParseTokensType = getParseTokensType(current, isFirst);
738 isFirst = false;
739 sal_Int32 const nextIndex(nextCharIndex); // == index of char following current
740 nextCharIndex = index; // == index of nextChar
741 sal_uInt32 nextChar((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0);
742 switch (eState)
744 case ssGetChar :
745 case ssRewindFromValue :
746 case ssIgnoreLeadingInRewind :
748 if ( (nMask & ParserFlags::CHAR_VALUE) && eState != ssRewindFromValue
749 && eState != ssIgnoreLeadingInRewind )
750 { //! must be first, may fall back to ssGetWord via bMightBeWord
751 eState = ssGetValue;
752 if ( nMask & ParserFlags::VALUE_DIGIT )
754 if (128 <= current)
755 r.TokenType = KParseType::UNI_NUMBER;
756 else
757 r.TokenType = KParseType::ASC_NUMBER;
759 else if (current == cDecimalSep || (bDecSepAltUsed = (cDecimalSepAlt && current == cDecimalSepAlt)))
761 if (nextChar)
762 ++nDecSeps;
763 else
764 eState = ssRewindFromValue;
765 // retry for ONE_SINGLE_CHAR or others
768 else if ( nMask & ParserFlags::CHAR_WORD )
770 eState = ssGetWord;
771 r.TokenType = KParseType::IDENTNAME;
773 else if ( nMask & ParserFlags::NAME_SEP )
775 eState = ssGetWordFirstChar;
776 bQuote = true;
777 postSymbolIndex = nextCharIndex;
778 nParseTokensType = 0; // will be taken of first real character
779 r.TokenType = KParseType::SINGLE_QUOTE_NAME;
781 else if ( nMask & ParserFlags::CHAR_STRING )
783 eState = ssGetString;
784 postSymbolIndex = nextCharIndex;
785 nParseTokensType = 0; // will be taken of first real character
786 r.TokenType = KParseType::DOUBLE_QUOTE_STRING;
788 else if ( nMask & ParserFlags::CHAR_DONTCARE )
790 if ( nStartTypes & KParseTokens::IGNORE_LEADING_WS )
792 if (eState == ssRewindFromValue)
793 eState = ssIgnoreLeadingInRewind;
794 r.LeadingWhiteSpace = nextCharIndex - nPos;
795 nCodePoints--; // exclude leading whitespace
796 postSymbolIndex = nextCharIndex;
797 nParseTokensType = 0; // wait until real character
798 bMightBeWord = true;
800 else
801 eState = ssBounce;
803 else if ( nMask & ParserFlags::CHAR_BOOL )
805 eState = ssGetBool;
806 r.TokenType = KParseType::BOOLEAN;
808 else if ( nMask & ParserFlags::CHAR )
809 { //! must be last
810 eState = ssStop;
811 r.TokenType = KParseType::ONE_SINGLE_CHAR;
813 else
814 eState = ssBounce; // not known
816 break;
817 case ssGetValue :
819 if ( nMask & ParserFlags::VALUE_DIGIT )
821 if (128 <= current)
822 r.TokenType = KParseType::UNI_NUMBER;
823 else if ( r.TokenType != KParseType::UNI_NUMBER )
824 r.TokenType = KParseType::ASC_NUMBER;
826 if ( nMask & ParserFlags::VALUE )
828 if (current == cGroupSep)
830 if (getFlags(nextChar) & ParserFlags::VALUE_DIGIT)
831 nParseTokensType |= KParseTokens::GROUP_SEPARATOR_IN_NUMBER;
832 else
834 // Trailing group separator character is not a
835 // group separator.
836 eState = ssStopBack;
839 else if ((current == cDecimalSep ||
840 (bDecSepAltUsed = (cDecimalSepAlt && current == cDecimalSepAlt))) &&
841 ++nDecSeps > 1)
843 if (nCodePoints == 2)
844 eState = ssRewindFromValue;
845 // consecutive separators
846 else
847 eState = ssStopBack;
849 // else keep it going
851 else if (current == 'E' || current == 'e')
853 ParserFlags nNext = getFlags(nextChar);
854 if ( nNext & ParserFlags::VALUE_EXP )
855 ; // keep it going
856 else if (bMightBeWord && ((nNext & ParserFlags::WORD) || !nextChar))
857 { // might be a numerical name (1.2efg)
858 eState = ssGetWord;
859 r.TokenType = KParseType::IDENTNAME;
861 else
862 eState = ssStopBack;
864 else if ( nMask & ParserFlags::VALUE_SIGN )
866 if ( (cLast == 'E') || (cLast == 'e') )
868 ParserFlags nNext = getFlags(nextChar);
869 if ( nNext & ParserFlags::VALUE_EXP_VALUE )
870 ; // keep it going
871 else if (bMightBeWord && ((nNext & ParserFlags::WORD) || !nextChar))
872 { // might be a numerical name (1.2e+fg)
873 eState = ssGetWord;
874 r.TokenType = KParseType::IDENTNAME;
876 else
877 eState = ssStopBack;
879 else if ( bMightBeWord )
880 { // might be a numerical name (1.2+fg)
881 eState = ssGetWord;
882 r.TokenType = KParseType::IDENTNAME;
884 else
885 eState = ssStopBack;
887 else if ( bMightBeWord && (nMask & ParserFlags::WORD) )
888 { // might be a numerical name (1995.A1)
889 eState = ssGetWord;
890 r.TokenType = KParseType::IDENTNAME;
892 else
893 eState = ssStopBack;
895 break;
896 case ssGetWordFirstChar :
897 eState = ssGetWord;
898 [[fallthrough]];
899 case ssGetWord :
901 if ( nMask & ParserFlags::WORD )
902 ; // keep it going
903 else if ( nMask & ParserFlags::NAME_SEP )
905 if ( bQuote )
907 if ( cLast == '\\' )
908 { // escaped
909 aSymbol.append(std::u16string_view(rText).substr(postSymbolIndex, nextCharIndex - postSymbolIndex - 2));
910 aSymbol.append(OUString(&current, 1));
912 else
914 eState = ssStop;
915 aSymbol.append(std::u16string_view(rText).substr(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
917 postSymbolIndex = nextCharIndex;
919 else
920 eState = ssStopBack;
922 else if ( bQuote )
923 ; // keep it going
924 else
925 eState = ssStopBack;
927 break;
928 case ssGetString :
930 if ( nMask & ParserFlags::STRING_SEP )
932 if ( cLast == '\\' )
933 { // escaped
934 aSymbol.append(std::u16string_view(rText).substr(postSymbolIndex, nextCharIndex - postSymbolIndex - 2));
935 aSymbol.append(OUString(&current, 1));
937 else if (current == nextChar &&
938 !(nContTypes & KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING) )
939 { // "" => literal " escaped
940 aSymbol.append(std::u16string_view(rText).substr(postSymbolIndex, nextCharIndex - postSymbolIndex));
941 nextCharIndex = index;
942 if (index < rText.getLength()) { ++nCodePoints; }
943 nextChar = (index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0;
945 else
947 eState = ssStop;
948 aSymbol.append(std::u16string_view(rText).substr(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
950 postSymbolIndex = nextCharIndex;
953 break;
954 case ssGetBool :
956 if ( nMask & ParserFlags::BOOL )
957 eState = ssStop; // maximum 2: <, >, <>, <=, >=
958 else
959 eState = ssStopBack;
961 break;
962 case ssStopBack :
963 case ssBounce :
964 case ssStop :
965 ; // nothing, no compiler warning
966 break;
968 if ( eState == ssRewindFromValue )
970 r = ParseResult();
971 index = nPos;
972 postSymbolIndex = nPos;
973 nextCharIndex = nPos;
974 aSymbol.setLength(0);
975 current = (index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0;
976 nCodePoints = (nPos < rText.getLength()) ? 1 : 0;
977 isFirst = true;
978 cLast = 0;
979 nDecSeps = 0;
980 bQuote = false;
981 bMightBeWord = true;
982 bMightBeWordLast = true;
983 bDecSepAltUsed = false;
985 else
987 if ( !(r.TokenType & nTokenType) )
989 if ( (r.TokenType & (KParseType::ASC_NUMBER | KParseType::UNI_NUMBER))
990 && (nTokenType & KParseType::IDENTNAME) && bMightBeWord )
991 ; // keep a number that might be a word
992 else if (r.LeadingWhiteSpace == (nextCharIndex - nPos))
993 ; // keep ignored white space
994 else if ( !r.TokenType && eState == ssGetValue && (nMask & ParserFlags::VALUE_SEP) )
995 ; // keep uncertain value
996 else
997 eState = ssBounce;
999 if ( eState == ssBounce )
1001 r.TokenType = 0;
1002 eState = ssStopBack;
1004 if ( eState == ssStopBack )
1005 { // put back
1006 nextChar = rText.iterateCodePoints(&index, -1);
1007 nextCharIndex = nextIndex;
1008 --nCodePoints;
1009 bMightBeWord = bMightBeWordLast;
1010 eState = ssStop;
1012 if ( eState != ssStop )
1014 if ( !r.StartFlags )
1015 r.StartFlags |= nParseTokensType;
1016 else
1017 r.ContFlags |= nParseTokensType;
1019 bMightBeWordLast = bMightBeWord;
1020 cLast = current;
1021 current = nextChar;
1024 // r.CharLen is the length in characters (not code units) of the parsed
1025 // token not including any leading white space.
1026 r.CharLen = nCodePoints;
1027 r.EndPos = nextCharIndex;
1028 if ( r.TokenType & KParseType::ASC_NUMBER )
1030 r.Value = rtl_math_uStringToDouble(rText.getStr() + nPos + r.LeadingWhiteSpace,
1031 rText.getStr() + r.EndPos, (bDecSepAltUsed ? cDecimalSepAlt : cDecimalSep), cGroupSep, nullptr, nullptr);
1032 if ( bMightBeWord )
1033 r.TokenType |= KParseType::IDENTNAME;
1035 else if ( r.TokenType & KParseType::UNI_NUMBER )
1037 if ( !xNatNumSup.is() )
1039 if ( m_xContext.is() )
1041 xNatNumSup = NativeNumberSupplier::create( m_xContext );
1044 OUString aTmp(rText.getStr() + nPos + r.LeadingWhiteSpace,
1045 r.EndPos - nPos - r.LeadingWhiteSpace);
1046 // transliterate to ASCII
1047 aTmp = xNatNumSup->getNativeNumberString( aTmp, aParserLocale,
1048 NativeNumberMode::NATNUM0 );
1049 r.Value = ::rtl::math::stringToDouble( aTmp, cDecimalSep, cGroupSep );
1050 if ( bMightBeWord )
1051 r.TokenType |= KParseType::IDENTNAME;
1053 else if ( r.TokenType & (KParseType::SINGLE_QUOTE_NAME | KParseType::DOUBLE_QUOTE_STRING) )
1055 if (postSymbolIndex < nextCharIndex)
1056 { //! open quote
1057 aSymbol.append(std::u16string_view(rText).substr(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
1058 r.TokenType |= KParseType::MISSING_QUOTE;
1060 r.DequotedNameOrString = aSymbol.toString();
1066 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */