1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "encodings/compact_lang_det/getonescriptspan.h"
9 #include "base/basictypes.h"
10 #include "encodings/lang_enc.h"
11 #include "encodings/compact_lang_det/utf8propjustletter.h"
12 #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
13 #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
15 #include "encodings/compact_lang_det/win/cld_basictypes.h"
16 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
17 #include "encodings/compact_lang_det/win/cld_google.h"
18 #include "encodings/compact_lang_det/win/cld_htmlutils.h"
19 #include "encodings/compact_lang_det/win/cld_unilib.h"
20 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
21 #include "encodings/compact_lang_det/win/cld_utf8utils.h"
// File-local constant: the raw value 254 cast to the Language type.
// NOTE(review): no use of GRAY_LANG is visible in this part of the file;
// presumably a sentinel language code -- confirm against its users.
23 static const Language GRAY_LANG
= (Language
)254;
// Tuning constants for rounding a span to a word boundary (per the
// original trailing comments). Neither constant is referenced in the
// visible part of this file.
// kMaxUpToWordBoundary: 50.
25 static const int kMaxUpToWordBoundary
= 50; // span < this make longer,
// kMaxAdvanceToWordBoundary: 10 (bytes, in either direction).
27 static const int kMaxAdvanceToWordBoundary
= 10; // +/- this many bytes
28 // to round to word boundary,
// 256-entry lookup table indexed by byte value. An entry is 1 only for
// '&' (0x26), '<' (0x3C) and '>' (0x3E); every other entry is 0.
// Each row below covers 32 consecutive byte values.
31 static const char kSpecialSymbol
[256] = { // true for < > &
// 0x00-0x1F
32 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
// 0x20-0x3F: '&' at 0x26, '<' at 0x3C, '>' at 0x3E
33 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
// 0x40-0x5F
34 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
// 0x60-0x7F
35 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
// 0x80-0xFF
37 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
38 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
39 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
40 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
// Character-class codes 16..19. These are the values stored in kCharToSub
// and used as the column index into a 20-column row of kTagParseTbl_0.
// The codes for columns 0..15 (LT, GT, EX, HY, QU, AP, SL, S_, C_, R_, I_,
// P_, T_, Y_, L_, E_ as used by the tables) are defined outside these lines.
61 #define CR 16 // <cr> or <lf>
62 #define NL 17 // non-letter: ASCII whitespace, digit, punctuation
63 #define PL 18 // possible letter, incl. &
64 #define xx 19 // <unused>
66 // Map byte to one of ~20 interesting categories for cheap tag parsing
// 256 entries, indexed by byte value; each group below holds the 16 entries
// for one 0x?0-0x?F range. ScanToPossibleLetter uses the result as the
// column index into the current kTagParseTbl_0 row.
67 static const uint8 kCharToSub
[256] = {
// 0x00-0x0F: all NL except LF (0x0A) and CR (0x0D), which map to CR
68 NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,CR
,NL
, NL
,CR
,NL
,NL
,
// 0x10-0x1F: all NL
69 NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
,
// 0x20-0x2F: '!'=EX, '"'=QU, '&'=PL, '\''=AP, '-'=HY, '/'=SL, rest NL
70 NL
,EX
,QU
,NL
, NL
,NL
,PL
,AP
, NL
,NL
,NL
,NL
, NL
,HY
,NL
,SL
,
// 0x30-0x3F: '<'=LT, '>'=GT, rest (digits and punctuation) NL
71 NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, LT
,NL
,GT
,NL
,
// 0x40-0x4F: '@' and 'A'-'O' are PL, except C, E, I, L get their own class
73 PL
,PL
,PL
,C_
, PL
,E_
,PL
,PL
, PL
,I_
,PL
,PL
, L_
,PL
,PL
,PL
,
// 0x50-0x5F: 'P'-'Z' are PL, except P, R, S, T, Y; '[' through '_' are NL
74 P_
,PL
,R_
,S_
, T_
,PL
,PL
,PL
, PL
,Y_
,PL
,NL
, NL
,NL
,NL
,NL
,
// 0x60-0x6F: same classes as 0x40-0x4F (lowercase letters fold onto the
// uppercase classes)
75 PL
,PL
,PL
,C_
, PL
,E_
,PL
,PL
, PL
,I_
,PL
,PL
, L_
,PL
,PL
,PL
,
// 0x70-0x7F: same classes as 0x50-0x5F
76 P_
,PL
,R_
,S_
, T_
,PL
,PL
,PL
, PL
,Y_
,PL
,NL
, NL
,NL
,NL
,NL
,
// 0x80-0x8F: all NL
78 NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
,
// 0x90-0x9F: all NL
79 NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
,
// 0xA0-0xAF: all NL
80 NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
,
// 0xB0-0xBF: all NL
81 NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
, NL
,NL
,NL
,NL
,
// 0xC0-0xCF: all PL
83 PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
,
// 0xD0-0xDF: all PL
84 PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
,
// 0xE0-0xEF: all PL
85 PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
,
// 0xF0-0xFF: all PL
86 PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
, PL
,PL
,PL
,PL
,
114 // State machine to do cheap parse of non-letter strings incl. tags
117 // advances <tag> ... </tag> for <script> <style>
119 // advances <!-- ... <tag> ... -->
123 // advances <tag <tag2>
// Layout: one row per state, 20 entries per row (ScanToPossibleLetter
// selects a row with &kTagParseTbl_0[state * 20]). The column is the
// character class from kCharToSub, in the order shown in the
// "< > ! - ..." header comments. Each entry is the next state; the state
// number and the input prefix it represents are given in the trailing
// comment of each row. OK and X_ are symbolic entries defined outside the
// visible lines.
125 static const uint8 kTagParseTbl_0
[] = {
126 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
127 3, 2, 2, 2, 2, 2, 2,OK
, OK
,OK
,OK
,OK
, OK
,OK
,OK
,OK
, 2, 2,OK
,X_
, // [0] OK
128 X_
,X_
,X_
,X_
, X_
,X_
,X_
,X_
, X_
,X_
,X_
,X_
, X_
,X_
,X_
,X_
, X_
,X_
,X_
,X_
, // [1] error
129 3, 2, 2, 2, 2, 2, 2,OK
, OK
,OK
,OK
,OK
, OK
,OK
,OK
,OK
, 2, 2,OK
,X_
, // [2] NL*
130 X_
, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_
, // [3] <
131 X_
, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_
, // [4] <!
132 X_
, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_
, // [5] <!-
133 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_
, // [6] <!--.*
134 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_
, // [7] <!--.*-
135 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_
, // [8] <!--.*--
136 X_
, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_
, // [9] <.*
137 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_
, // [10] <.*"
138 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_
, // [11] <.*'
139 X_
, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_
, // [12] <.* no " '
// States 13-27: recognise "<SCRIPT", then stay in state 19 until
// "</SCRIPT" followed by '>' returns to state 2.
141 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
142 X_
, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_
, // [13] <S
143 X_
, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_
, // [14] <SC
144 X_
, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_
, // [15] <SCR
145 X_
, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_
, // [16] <SCRI
146 X_
, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_
, // [17] <SCRIP
147 X_
,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_
, // [18] <SCRIPT
148 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_
, // [19] <SCRIPT .*
149 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_
, // [20] <SCRIPT .*<
150 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_
, // [21] <SCRIPT .*</
151 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_
, // [22] <SCRIPT .*</S
152 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_
, // [23] <SCRIPT .*</SC
153 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_
, // [24] <SCRIPT .*</SCR
154 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_
, // [25] <SCRIPT .*</SCRI
155 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_
, // [26] <SCRIPT .*</SCRIP
156 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_
, // [27] <SCRIPT .*</SCRIPT
// States 28-39: same pattern for "<STYLE" ... "</STYLE", with state 32
// as the body state.
158 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
159 X_
, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_
, // [28] <ST
160 X_
, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_
, // [29] <STY
161 X_
, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_
, // [30] <STYL
162 X_
,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_
, // [31] <STYLE
163 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_
, // [32] <STYLE .*
164 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_
, // [33] <STYLE .*<
165 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_
, // [34] <STYLE .*</
166 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_
, // [35] <STYLE .*</S
167 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_
, // [36] <STYLE .*</ST
168 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_
, // [37] <STYLE .*</STY
169 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_
, // [38] <STYLE .*</STYL
170 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_
, // [39] <STYLE .*</STYLE
178 // Convert GetTimeOfDay output to 64-bit usec
// Param t: a struct timeval.
// Returns t.tv_sec * 1,000,000 + t.tv_usec. The ULL literal forces the
// multiplication to be done in unsigned 64-bit arithmetic.
// No caller is visible in this part of the file (the gettimeofday calls
// further down are commented out).
179 static inline uint64 Microseconds(const struct timeval& t) {
180 // The SumReducer uses uint64, so convert to (uint64) microseconds,
181 // not (double) seconds.
182 return t.tv_sec * 1000000ULL + t.tv_usec;
187 // Returns true if character is < > or &
188 bool inline IsSpecial(char c
) {
// Cheap pre-filter: (c & 0xe0) == 0x20 holds exactly for bytes 0x20-0x3F,
// the only range in which kSpecialSymbol has non-zero entries.
189 if ((c
& 0xe0) == 0x20) {
// Cast to uint8 so the char is used as a non-negative table index.
190 return kSpecialSymbol
[static_cast<uint8
>(c
)];
// NOTE(review): the branch taken when the mask test fails is not visible
// here; presumably it returns false -- confirm.
195 // Quick Skip to next letter or < > & or to end of string (eos)
196 // Always return is_letter for eos
// Params: src -- start of the bytes to scan; len -- number of bytes
// available at src.
// Returns bytes_consumed, i.e. how far the scan advanced from src.
197 int ScanToLetterOrSpecial(const char* src
, int len
) {
// Delegates the scan to cld::UTF8GenericScan, driven by the
// utf8scannotjustletterspecial_obj table object.
// NOTE(review): the declaration of bytes_consumed and the last argument of
// this call are not visible here; presumably bytes_consumed is the call's
// output parameter -- confirm.
199 cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj
, src
, len
,
201 return bytes_consumed
;
206 // src points to non-letter, such as tag-opening '<'
207 // Return length from here to next possible letter
208 // On eos or another < before >, return 1
211 // advances <tag> ... </tag> for <script> <style>
213 // advances <!-- ... <tag> ... -->
217 // advances <tag <tag2>
// Params: isrc -- first byte to examine; len -- number of bytes available.
// Runs the kTagParseTbl_0 state machine over the bytes, classifying each
// byte with kCharToSub.
219 int ScanToPossibleLetter(const char* isrc
, int len
) {
// Work on unsigned bytes so each byte can index kCharToSub directly.
220 const uint8
* src
= reinterpret_cast<const uint8
*>(isrc
);
221 const uint8
* srclimit
= src
+ len
;
// Start in state 0: tagParseTbl always points at the current state's row.
222 const uint8
* tagParseTbl
= kTagParseTbl_0
;
224 while (src
< srclimit
) {
// Next state = current row indexed by the class of the next byte;
// src is advanced past that byte.
225 e
= tagParseTbl
[kCharToSub
[*src
++]];
227 // We overshot by one byte
// Rows are 20 entries wide, so state e starts at offset e * 20.
231 tagParseTbl
= &kTagParseTbl_0
[e
* 20];
234 if (src
>= srclimit
) {
235 // We fell off the end of the text.
236 // It looks like the most common case for this is a truncated file, not
237 // mismatched angle brackets. So we pretend that the last char was '>'
241 // OK to be in state 0 or state 2 at exit
242 if ((e
!= 0) && (e
!= 2)) {
243 // Error, '<' followed by '<'
244 // We want to back up to first <, then advance by one byte past it
// offset = number of bytes consumed so far, measured from isrc.
245 int offset
= src
- reinterpret_cast<const uint8
*>(isrc
);
246 // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
248 // Backscan to first '<' and return enough length to just get past it
249 --offset
; // back up over the second '<', which caused us to stop
// Loop stops at offset 0 or at the nearest earlier '<'.
250 while ((0 < offset
) && (isrc
[offset
] != '<')) {
251 // Find the first '<', which is unmatched
254 // skip to just beyond first '<'
255 // printf(" returning %d\n", offset + 1);
// Normal exit: return the number of bytes consumed from isrc.
259 return src
- reinterpret_cast<const uint8
*>(isrc
);
// Constructs a scanner over the caller-supplied text in buffer.
// The initializer list stores the start of the buffer, the
// one-past-the-end limit (buffer + buffer_length), the length, and the
// is_plain_text flag. The body heap-allocates the two output buffers;
// the destructor releases them.
// NOTE(review): the remaining parameter declarations and at least one
// initializer are not visible in these lines. The class holds raw owning
// pointers -- confirm copy construction/assignment is disabled in the header.
264 ScriptScanner::ScriptScanner(const char* buffer
,
267 : start_byte_(buffer
),
269 next_byte_limit_(buffer
+ buffer_length
),
270 byte_length_(buffer_length
),
271 is_plain_text_(is_plain_text
) {
// Output buffer for the extracted span, sized getone::kMaxScriptBuffer.
272 script_buffer_
= new char[getone::kMaxScriptBuffer
];
// Output buffer for the lowercased copy, sized getone::kMaxScriptLowerBuffer.
273 script_buffer_lower_
= new char[getone::kMaxScriptLowerBuffer
];
// Releases the two character buffers allocated with new[] in the constructor.
276 ScriptScanner::~ScriptScanner() {
277 delete[] script_buffer_
;
278 delete[] script_buffer_lower_
;
284 // Get to the first real non-tag letter or entity that is a letter
285 // Sets script of that letter
286 // Return len if no more letters
// Params: src -- start of the remaining input; len -- bytes available at
// src; script -- out-parameter for the script number of the letter found.
// Tag and entity handling is skipped entirely when is_plain_text_ is set.
287 int ScriptScanner::SkipToFrontOfSpan(const char* src
, int len
, int* script
) {
288 int sc
= UNKNOWN_LSCRIPT
;
292 // Do run of non-letters (tag | &NL | NL)*
294 // Do fast scan to next interesting byte
295 // int oldskip = skip;
296 skip
+= ScanToLetterOrSpecial(src
+ skip
, len
- skip
);
298 // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
299 // oldskip, src[oldskip], skip, src[skip]);
301 // Check for no more letters/specials
307 // We are at a letter, nonletter, tag, or entity
308 if (IsSpecial(src
[skip
]) && !is_plain_text_
) {
309 if (src
[skip
] == '<') {
310 // Beginning of tag; skip to end and go around again
311 tlen
= ScanToPossibleLetter(src
+ skip
, len
- skip
);
314 } else if (src
[skip
] == '>') {
315 // Unexpected end of tag; skip it and go around again
316 tlen
= 1; // Over the >
319 } else if (src
[skip
] == '&') {
320 // Expand entity, no advance
// The entity is expanded into a temporary buffer (temp) so that the script
// of the resulting character can be looked up.
322 EntityToBuffer(src
+ skip
, len
- skip
,
324 sc
= getone::GetUTF8LetterScriptNum(temp
);
325 // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
// Ordinary character: its UTF-8 length is the advance, and its script
// number decides whether the span starts here.
329 tlen
= cld_UniLib::OneCharLen(src
+ skip
);
330 sc
= getone::GetUTF8LetterScriptNum(src
+ skip
);
331 // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
334 // printf("sc=%d ", sc);
// A non-zero script number means a letter; stop without consuming it.
335 if (sc
!= 0) {break;} // Letter found
336 skip
+= tlen
; // Advance
// Compile-time switch: true when NEED_ALIGNED_LOADS is defined, false
// otherwise. GetOneScriptSpan takes its 4-byte word-copy fast path only
// when this is false.
343 #ifdef NEED_ALIGNED_LOADS
344 static const bool kNeedsAlignedLoads
= true;
346 static const bool kNeedsAlignedLoads
= false;
350 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
351 // Buffer has leading space and all text is lowercased
// Param span: output record. text, text_bytes, offset, script, lang and
// truncated are reset on entry and then filled in as the span is built.
// Returns false when no further letters remain in the input.
// Side effects: writes into script_buffer_ and reduces byte_length_ by the
// number of input bytes skipped and consumed.
352 bool ScriptScanner::GetOneScriptSpan(getone::LangSpan
* span
) {
353 span
->text
= script_buffer_
;
354 span
->text_bytes
= 0;
// Offset of this span relative to the start of the whole input buffer.
355 span
->offset
= next_byte_
- start_byte_
;
356 span
->script
= UNKNOWN_LSCRIPT
;
357 span
->lang
= UNKNOWN_LANGUAGE
;
358 span
->truncated
= false;
360 // printf("GetOneScriptSpan[[ ");
361 // struct timeval script_start, script_mid, script_end;
363 int spanscript
; // The script of this span
364 int sc
= UNKNOWN_LSCRIPT
; // The script of next character
368 script_buffer_
[0] = ' '; // Always a space at front of output
369 script_buffer_
[1] = '\0';
371 int put
= 1; // Start after the initial space
373 // gettimeofday(&script_start, NULL);
374 // Get to the first real non-tag letter or entity that is a letter
375 int skip
= SkipToFrontOfSpan(next_byte_
, byte_length_
, &spanscript
);
377 byte_length_
-= skip
;
378 if (byte_length_
<= 0) {
380 return false; // No more letters to be found
383 // gettimeofday(&script_mid, NULL);
385 // There is at least one letter, so we know the script for this span
386 // printf("{%d} ", spanscript);
387 span
->script
= (UnicodeLScript
)spanscript
;
390 // Go over alternating spans of same-script letters and non-letters,
391 // copying letters to buffer with single spaces for each run of non-letters
// take = input bytes consumed from next_byte_; put = output bytes written
// to script_buffer_.
392 while (take
< byte_length_
) {
393 // Copy run of letters in same script (&LS | LS)*
394 int letter_count
= 0; // Keep track of word length
395 bool need_break
= false;
396 while (take
< byte_length_
) {
397 // We are at a letter, nonletter, tag, or entity
398 if (IsSpecial(next_byte_
[take
]) && !is_plain_text_
) {
399 // printf("\"%c\" ", next_byte_[take]);
400 if (next_byte_
[take
] == '<') {
404 } else if (next_byte_
[take
] == '>') {
405 // Unexpected end of tag
408 } else if (next_byte_
[take
] == '&') {
409 // Copy entity, no advance
// EntityToBuffer writes the expansion at script_buffer_ + put and reports
// the input length in tlen and the output length in plen.
410 EntityToBuffer(next_byte_
+ take
, byte_length_
- take
,
411 script_buffer_
+ put
, &tlen
, &plen
);
412 sc
= getone::GetUTF8LetterScriptNum(script_buffer_
+ put
);
415 // Real letter, safely copy up to 4 bytes, increment by 1..4
416 // Will update by 1..4 bytes at Advance, below
417 tlen
= plen
= cld_UniLib::OneCharLen(next_byte_
+ take
);
// Fast path: copy a whole 4-byte word whenever at least 4 input bytes
// remain. Only plen of those bytes are kept, because put advances by plen
// below and later writes overwrite the surplus.
// NOTE(review): this is an unaligned, type-punned uint32 load/store through
// reinterpret_cast; it is only compiled in when kNeedsAlignedLoads is false.
418 if (!kNeedsAlignedLoads
&& (take
< (byte_length_
- 3))) {
420 *reinterpret_cast<uint32
*>(script_buffer_
+ put
) =
421 *reinterpret_cast<const uint32
*>(next_byte_
+ take
);
423 // Slow case, happens 1-3 times per input document
424 memcpy(script_buffer_
+ put
, next_byte_
+ take
, plen
);
426 sc
= getone::GetUTF8LetterScriptNum(next_byte_
+ take
);
428 // printf("sc(%c)=%d ", next_byte_[take], sc);
429 // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
430 // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
432 // Allow continue across a single letter in a different script:
433 // A B D = three scripts, c = common script, i = inherited script,
434 // - = don't care, ( = take position before the += below
439 // AAA(Bc continue (breaks after B)
// Inherited-script characters never break the span.
448 if ((sc
!= spanscript
) && (sc
!= ULScript_Inherited
)) {
449 // Might need to break this script span
450 if (sc
== ULScript_Common
) {
453 // Look at next following character, ignoring entity as Common
// NOTE(review): this reads the character at take + tlen with no bounds
// check against byte_length_ in the visible lines; presumably the input is
// padded or NUL-terminated -- confirm.
454 int sc2
= getone::GetUTF8LetterScriptNum(next_byte_
+ take
+ tlen
);
455 if ((sc2
!= ULScript_Common
) && (sc2
!= spanscript
)) {
460 if (need_break
) {break;} // Non-letter or letter in wrong script
462 take
+= tlen
; // Advance
463 put
+= plen
; // Advance
// Stop copying once the output reaches getone::kMaxScriptBytes.
465 if (put
>= getone::kMaxScriptBytes
) {
467 span
->truncated
= true;
470 } // End while letters
472 // Do run of non-letters (tag | &NL | NL)*
473 while (take
< byte_length_
) {
474 // Do fast scan to next interesting byte
475 take
+= ScanToLetterOrSpecial(next_byte_
+ take
, byte_length_
- take
);
477 // Check for no more letters/specials
478 if (take
>= byte_length_
) {
483 // We are at a letter, nonletter, tag, or entity
484 if (IsSpecial(next_byte_
[take
]) && !is_plain_text_
) {
485 // printf("\"%c\" ", next_byte_[take]);
486 if (next_byte_
[take
] == '<') {
487 // Beginning of tag; skip to end and go around again
488 tlen
= ScanToPossibleLetter(next_byte_
+ take
, byte_length_
- take
);
491 } else if (next_byte_
[take
] == '>') {
492 // Unexpected end of tag; skip it and go around again
493 tlen
= 1; // Over the >
496 } else if (next_byte_
[take
] == '&') {
497 // Expand entity, no advance
// Expanded at script_buffer_ + put only to inspect its script; put is not
// advanced in the visible lines of this loop.
498 EntityToBuffer(next_byte_
+ take
, byte_length_
- take
,
499 script_buffer_
+ put
, &tlen
, &plen
);
500 sc
= getone::GetUTF8LetterScriptNum(script_buffer_
+ put
);
504 tlen
= cld_UniLib::OneCharLen(next_byte_
+ take
);
505 sc
= getone::GetUTF8LetterScriptNum(next_byte_
+ take
);
507 // printf("sc[%c]=%d ", next_byte_[take], sc);
508 if (sc
!= 0) {break;} // Letter found
509 take
+= tlen
; // Advance
510 } // End while not-letters
// A whole run of non-letters collapses to a single space in the output.
512 script_buffer_
[put
++] = ' ';
514 // We are at a letter again (or eos), after letter* not-letter*
515 if (sc
!= spanscript
) {break;} // Letter in wrong script
// Stop 8 bytes short of kMaxScriptBytes here.
516 if (put
>= getone::kMaxScriptBytes
- 8) {
517 // Buffer is almost full
518 span
->truncated
= true;
523 // Update input position
525 byte_length_
-= take
;
527 // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
528 // kMaxScriptBytes | | put
529 script_buffer_
[put
+ 0] = ' ';
530 script_buffer_
[put
+ 1] = ' ';
531 script_buffer_
[put
+ 2] = ' ';
532 script_buffer_
[put
+ 3] = '\0';
534 span
->text_bytes
= put
; // Does not include the last four chars above
536 // printf(" %d]]\n\n", put);
540 // Force Latin, Cyrillic, Greek scripts to be lowercase
// Param span: span previously filled in by GetOneScriptSpan. When its
// script is Latin, Cyrillic or Greek, the text is lowercased into
// script_buffer_lower_ and span->text / span->text_bytes are repointed to
// that copy. Compiled to a no-op when CLD_WINDOWS is defined.
541 void ScriptScanner::LowerScriptSpan(getone::LangSpan
* span
) {
542 // On Windows, text is lowercased beforehand, so no need to do anything here.
543 #if !defined(CLD_WINDOWS)
544 // If needed, lowercase all the text. If we do it sooner, might miss
545 // lowercasing an entity such as &Aacute; (Á)
546 // We only need to do this for Latin, Cyrillic, and Greek scripts
547 if ((span
->script
== ULScript_Latin
) ||
548 (span
->script
== ULScript_Cyrillic
) ||
549 (span
->script
== ULScript_Greek
)) {
550 // Full Unicode lowercase of the entire buffer, including
551 // four pad bytes off the end
552 int consumed
, filled
;
// The + 4 covers the four pad bytes, which text_bytes does not count.
553 UniLib::ToLower(span
->text
, span
->text_bytes
+ 4,
554 script_buffer_lower_
, getone::kMaxScriptLowerBuffer
,
556 span
->text
= script_buffer_lower_
;
// Exclude the four pad bytes again from the reported length.
557 span
->text_bytes
= filled
- 4;
562 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
563 // Force Latin, Cyrillic, and Greek scripts to be lowercase (via LowerScriptSpan)
// Param span: output record, filled in by GetOneScriptSpan and then
// lowercased in place by LowerScriptSpan.
// Note that LowerScriptSpan is called regardless of the value of ok.
564 bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan
* span
) {
565 bool ok
= GetOneScriptSpan(span
);
566 LowerScriptSpan(span
);
570 // Gets lscript number for letters; always returns
571 // 0 (common script) for non-letters
// Param src: pointer to the first byte of one UTF-8 character.
// Returns the property value that UTF8GenericProperty looks up for that
// character in the utf8propletterscriptnum_obj table.
572 int getone::GetUTF8LetterScriptNum(const char* src
) {
// Byte length of the character at src.
573 int srclen
= cld_UniLib::OneCharLen(src
);
574 const uint8
* usrc
= reinterpret_cast<const uint8
*>(src
);
// usrc and srclen are passed by address; any update the callee makes to
// these local copies is discarded.
575 return UTF8GenericProperty(&utf8propletterscriptnum_obj
, &usrc
, &srclen
);