third_party/cld/encodings/compact_lang_det/getonescriptspan.cc

   1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "encodings/compact_lang_det/getonescriptspan.h"
   6 #include <stdio.h>
   7 #include <string.h>
   8
   9 #include "base/basictypes.h"
  10 #include "encodings/lang_enc.h"
  11 #include "encodings/compact_lang_det/utf8propjustletter.h"
  12 #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
  13 #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
  14
  15 #include "encodings/compact_lang_det/win/cld_basictypes.h"
  16 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
  17 #include "encodings/compact_lang_det/win/cld_google.h"
  18 #include "encodings/compact_lang_det/win/cld_htmlutils.h"
  19 #include "encodings/compact_lang_det/win/cld_unilib.h"
  20 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
  21 #include "encodings/compact_lang_det/win/cld_utf8utils.h"
  22
  23 static const Language GRAY_LANG = (Language)254;
  24
  25 static const int kMaxUpToWordBoundary = 50;       // span < this make longer,
  26                                                   // else make shorter
  27 static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes
  28                                                   // to round to word boundary,
  29                                                   // direction above
  30
  31 static const char kSpecialSymbol[256] = {       // indexed by byte value; 1 for < > &, else 0
  32   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x00-0x1f
  33   0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,  // 0x20-0x3f: '&' 0x26, '<' 0x3c, '>' 0x3e
  34   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x40-0x5f
  35   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x60-0x7f
  36
  37   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x80-0x9f
  38   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xa0-0xbf
  39   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xc0-0xdf
  40   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xe0-0xff
  41 };
  42
  43
  44
  45 #define LT 0      // <   (these macros are the column indexes of the tag-parse state table)
  46 #define GT 1      // >
  47 #define EX 2      // !
  48 #define HY 3      // -
  49 #define QU 4      // "
  50 #define AP 5      // '
  51 #define SL 6      // /
  52 #define S_ 7      // S or s
  53 #define C_ 8      // C or c
  54 #define R_ 9      // R or r
  55 #define I_ 10     // I or i
  56 #define P_ 11     // P or p
  57 #define T_ 12     // T or t
  58 #define Y_ 13     // Y or y
  59 #define L_ 14     // L or l
  60 #define E_ 15     // E or e
  61 #define CR 16     // <cr> or <lf>
  62 #define NL 17     // non-letter: ASCII whitespace, digit, punctuation
  63 #define PL 18     // possible letter, incl. &
  64 #define xx 19     // <unused>; no byte below maps to it
  65
  66 // Map byte to one of ~20 interesting categories for cheap tag parsing.
     // Indexed by unsigned byte value, 16 entries per row. The letters that
     // spell SCRIPT and STYLE get their own categories in both cases; bytes
     // 0x80-0xbf are classed as non-letters and bytes 0xc0-0xff as possible
     // letters.
  67 static const uint8 kCharToSub[256] = {
  68   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,  // 0x00-0x0f: <lf> 0x0a, <cr> 0x0d
  69   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0x10-0x1f
  70   NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,  // 0x20-0x2f: ! " & ' - /
  71   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,  // 0x30-0x3f: digits, < >
  72
  73   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,  // 0x40-0x4f: @ A-O
  74   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,  // 0x50-0x5f: P-Z [ \ ] ^ _
  75   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,  // 0x60-0x6f: ` a-o
  76   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,  // 0x70-0x7f: p-z { | } ~ DEL
  77
  78   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0x80-0x8f
  79   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0x90-0x9f
  80   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0xa0-0xaf
  81   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0xb0-0xbf
  82
  83   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xc0-0xcf
  84   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xd0-0xdf
  85   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xe0-0xef
  86   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xf0-0xff
  87 };
  88
     // The category macros are only needed to spell out the table above.
  89 #undef LT
  90 #undef GT
  91 #undef EX
  92 #undef HY
  93 #undef QU
  94 #undef AP
  95 #undef SL
  96 #undef S_
  97 #undef C_
  98 #undef R_
  99 #undef I_
 100 #undef P_
 101 #undef T_
 102 #undef Y_
 103 #undef L_
 104 #undef E_
 105 #undef CR
 106 #undef NL
 107 #undef PL
 108 #undef xx
 109
 110
 111 #define OK 0      // state 0: ScanToPossibleLetter stops the scan when it reaches this state
 112 #define X_ 1      // state 1: error; also stops the scan
 113
 114 // State machine to do cheap parse of non-letter strings incl. tags
 115 // advances <tag>
 116 //          |    |
 117 // advances <tag> ... </tag>  for <script> <style>
 118 //          |               |
 119 // advances <!-- ... <tag> ... -->
 120 //          |                     |
 121 // advances <tag
 122 //          ||  (0)
 123 // advances <tag <tag2>
 124 //          ||  (0)
     //
     // Layout: one row per state, 20 entries per row. The column is the
     // kCharToSub category of the current byte and the entry is the next
     // state, so the row for state s starts at index s * 20. The scan starts
     // in state 0. The bracketed number in each row's trailing comment is the
     // state number, followed by the input prefix that leads to that state.
 125 static const uint8 kTagParseTbl_0[] = {
 126 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
 127    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK
 128   X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
 129    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*
 130   X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
 131   X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
 132   X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
 133    6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
 134    6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
 135    6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
 136   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
 137   10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
 138   11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
 139   X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
 140
     // States 13-27: recognize <SCRIPT, then skip everything up to </SCRIPT>
 141 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
 142   X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
 143   X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
 144   X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
 145   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
 146   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
 147   X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
 148   20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
 149   19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
 150   19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
 151   19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
 152   19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
 153   19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
 154   19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
 155   19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
 156   19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
 157
     // States 28-39: recognize <STYLE, then skip everything up to </STYLE>
 158 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
 159   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
 160   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
 161   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
 162   X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
 163   33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
 164   32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
 165   32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
 166   32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
 167   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
 168   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
 169   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
 170   32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
 171 };
 172
 173 #undef OK
 174 #undef X_
 175
 176
 177 /*
 178 // Convert GetTimeOfDay output to 64-bit usec
 179 static inline uint64 Microseconds(const struct timeval& t) {
 180   // The SumReducer uses uint64, so convert to (uint64) microseconds,
 181   // not (double) seconds.
 182   return t.tv_sec * 1000000ULL + t.tv_usec;
 183 }
 184 */
 185
 186
 187 // Returns true if character is < > or &
 188 bool inline IsSpecial(char c) {
 189   if ((c & 0xe0) == 0x20) {
 190     return kSpecialSymbol[static_cast<uint8>(c)];
 191   }
 192   return false;
 193 }
 194
 195 // Quick Skip to next letter or < > & or to end of string (eos)
 196 // Always return is_letter for eos
 197 int ScanToLetterOrSpecial(const char* src, int len) {
 198   int bytes_consumed;
 199   cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
 200                        &bytes_consumed);
 201   return bytes_consumed;
 202 }
 203
 204
 205
 206 // src points to non-letter, such as tag-opening '<'
 207 // Return length from here to next possible letter
 208 // On eos or another < before >, return 1
 209 // advances <tag>
 210 //          |    |
 211 // advances <tag> ... </tag>  for <script> <style>
 212 //          |               |
 213 // advances <!-- ... <tag> ... -->
 214 //          |                     |
 215 // advances <tag
 216 //          ||  (1)
 217 // advances <tag <tag2>
 218 //          ||  (1)
 219 int ScanToPossibleLetter(const char* isrc, int len) {
 220   const uint8* src = reinterpret_cast<const uint8*>(isrc);
 221   const uint8* srclimit = src + len;
 222   const uint8* tagParseTbl = kTagParseTbl_0;
 223   int e = 0;
 224   while (src < srclimit) {
 225     e = tagParseTbl[kCharToSub[*src++]];
 226     if ((e & ~1) == 0) {
 227       // We overshot by one byte
 228       --src;
 229       break;
 230     }
 231     tagParseTbl = &kTagParseTbl_0[e * 20];
 232   }
 233
 234   if (src >= srclimit) {
 235     // We fell off the end of the text.
 236     // It looks like the most common case for this is a truncated file, not
 237     // mismatched angle brackets. So we pretend that the last char was '>'
 238     return len;
 239   }
 240
 241   // OK to be in state 0 or state 2 at exit
 242   if ((e != 0) && (e != 2)) {
 243     // Error, '<' followed by '<'
 244     // We want to back up to first <, then advance by one byte past it
 245     int offset = src - reinterpret_cast<const uint8*>(isrc);
 246     // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
 247
 248     // Backscan to first '<' and return enough length to just get past it
 249     --offset;   // back up over the second '<', which caused us to stop
 250     while ((0 < offset) && (isrc[offset] != '<')) {
 251       // Find the first '<', which is unmatched
 252       --offset;
 253     }
 254     // skip to just beyond first '<'
 255     // printf("  returning %d\n", offset + 1);
 256     return offset + 1;
 257   }
 258
 259   return src - reinterpret_cast<const uint8*>(isrc);
 260 }
 261
 262
 263
 264 ScriptScanner::ScriptScanner(const char* buffer,
 265                              int buffer_length,
 266                              bool is_plain_text)
 267   : start_byte_(buffer),
 268   next_byte_(buffer),
 269   next_byte_limit_(buffer + buffer_length),
 270   byte_length_(buffer_length),
 271   is_plain_text_(is_plain_text) {
 272     script_buffer_ = new char[getone::kMaxScriptBuffer];
 273     script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
 274 }
 275
 276 ScriptScanner::~ScriptScanner() {
 277   delete[] script_buffer_;
 278   delete[] script_buffer_lower_;
 279 }
 280
 281
 282
 283
 284 // Get to the first real non-tag letter or entity that is a letter
 285 // Sets script of that letter
 286 // Return len if no more letters
 287 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
 288   int sc = UNKNOWN_LSCRIPT;
 289   int skip = 0;
 290   int tlen, plen;
 291
 292   // Do run of non-letters (tag | &NL | NL)*
 293   while (skip < len) {
 294     // Do fast scan to next interesting byte
 295     // int oldskip = skip;
 296     skip += ScanToLetterOrSpecial(src + skip, len - skip);
 297     // TEMP
 298     // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
 299     //       oldskip, src[oldskip], skip, src[skip]);
 300
 301     // Check for no more letters/specials
 302     if (skip >= len) {
 303       // All done
 304       return len;
 305     }
 306
 307     // We are at a letter, nonletter, tag, or entity
 308     if (IsSpecial(src[skip]) && !is_plain_text_) {
 309       if (src[skip] == '<') {
 310         // Begining of tag; skip to end and go around again
 311         tlen = ScanToPossibleLetter(src + skip, len - skip);
 312         sc = 0;
 313         // printf("<...> ");
 314       } else if (src[skip] == '>') {
 315         // Unexpected end of tag; skip it and go around again
 316         tlen = 1;         // Over the >
 317         sc = 0;
 318         // printf("..> ");
 319       } else if (src[skip] == '&') {
 320         // Expand entity, no advance
 321         char temp[4];
 322         EntityToBuffer(src + skip, len - skip,
 323                        temp, &tlen, &plen);
 324         sc = getone::GetUTF8LetterScriptNum(temp);
 325         // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
 326       }
 327     } else {
 328       // Update 1..4 bytes
 329       tlen = cld_UniLib::OneCharLen(src + skip);
 330       sc = getone::GetUTF8LetterScriptNum(src + skip);
 331       // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
 332     }
 333     // TEMP
 334     // printf("sc=%d ", sc);
 335     if (sc != 0) {break;}           // Letter found
 336     skip += tlen;                   // Advance
 337   }
 338
 339   *script = sc;
 340   return skip;
 341 }
 342
 343 #ifdef NEED_ALIGNED_LOADS
 344 static const bool kNeedsAlignedLoads = true;
 345 #else
 346 static const bool kNeedsAlignedLoads = false;
 347 #endif
 348
 349
 350 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
 351 // Buffer has leading space and all text is lowercased
 352 bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
 353   span->text = script_buffer_;
 354   span->text_bytes = 0;
 355   span->offset = next_byte_ - start_byte_;
 356   span->script = UNKNOWN_LSCRIPT;
 357   span->lang = UNKNOWN_LANGUAGE;
 358   span->truncated = false;
 359
 360   // printf("GetOneScriptSpan[[ ");
 361   // struct timeval script_start, script_mid, script_end;
 362
 363   int spanscript;           // The script of this span
 364   int sc = UNKNOWN_LSCRIPT;  // The script of next character
 365   int tlen, plen;
 366
 367
 368   script_buffer_[0] = ' ';  // Always a space at front of output
 369   script_buffer_[1] = '\0';
 370   int take = 0;
 371   int put = 1;              // Start after the initial space
 372
 373   // gettimeofday(&script_start, NULL);
 374   // Get to the first real non-tag letter or entity that is a letter
 375   int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
 376   next_byte_ += skip;
 377   byte_length_ -= skip;
 378   if (byte_length_ <= 0) {
 379     // printf("]]\n");
 380     return false;               // No more letters to be found
 381   }
 382
 383   // gettimeofday(&script_mid, NULL);
 384
 385   // There is at least one letter, so we know the script for this span
 386   // printf("{%d} ", spanscript);
 387   span->script = (UnicodeLScript)spanscript;
 388
 389
 390   // Go over alternating spans of same-script letters and non-letters,
 391   // copying letters to buffer with single spaces for each run of non-letters
 392   while (take < byte_length_) {
 393     // Copy run of letters in same script (&LS | LS)*
 394     int letter_count = 0;              // Keep track of word length
 395     bool need_break = false;
 396     while (take < byte_length_) {
 397       // We are at a letter, nonletter, tag, or entity
 398       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
 399         // printf("\"%c\" ", next_byte_[take]);
 400         if (next_byte_[take] == '<') {
 401           // Begining of tag
 402           sc = 0;
 403           break;
 404         } else if (next_byte_[take] == '>') {
 405           // Unexpected end of tag
 406           sc = 0;
 407           break;
 408         } else if (next_byte_[take] == '&') {
 409           // Copy entity, no advance
 410           EntityToBuffer(next_byte_ + take, byte_length_ - take,
 411                          script_buffer_ + put, &tlen, &plen);
 412           sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
 413         }
 414       } else {
 415         // Real letter, safely copy up to 4 bytes, increment by 1..4
 416         // Will update by 1..4 bytes at Advance, below
 417         tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
 418         if (!kNeedsAlignedLoads && (take < (byte_length_ - 3))) {
 419           // Fast case
 420           *reinterpret_cast<uint32*>(script_buffer_ + put) =
 421             *reinterpret_cast<const uint32*>(next_byte_ + take);
 422         } else {
 423           // Slow case, happens 1-3 times per input document
 424           memcpy(script_buffer_ + put, next_byte_ + take, plen);
 425         }
 426         sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
 427       }
 428       // printf("sc(%c)=%d ", next_byte_[take], sc);
 429       // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
 430       // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
 431
 432       // Allow continue across a single letter in a different script:
 433       // A B D = three scripts, c = common script, i = inherited script,
 434       // - = don't care, ( = take position before the += below
 435       //  AAA(A-    continue
 436       //
 437       //  AAA(BA    continue
 438       //  AAA(BB    break
 439       //  AAA(Bc    continue (breaks after B)
 440       //  AAA(BD    break
 441       //  AAA(Bi    break
 442       //
 443       //  AAA(c-    break
 444       //
 445       //  AAA(i-    continue
 446       //
 447
 448       if ((sc != spanscript) && (sc != ULScript_Inherited)) {
 449         // Might need to break this script span
 450         if (sc == ULScript_Common) {
 451           need_break = true;
 452         } else {
 453           // Look at next following character, ignoring entity as Common
 454           int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
 455           if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
 456             need_break = true;
 457           }
 458         }
 459       }
 460       if (need_break) {break;}  // Non-letter or letter in wrong script
 461
 462       take += tlen;                   // Advance
 463       put += plen;                    // Advance
 464       ++letter_count;
 465       if (put >= getone::kMaxScriptBytes) {
 466         // Buffer is full
 467         span->truncated = true;
 468         break;
 469       }
 470     }     // End while letters
 471
 472     // Do run of non-letters (tag | &NL | NL)*
 473     while (take < byte_length_) {
 474       // Do fast scan to next interesting byte
 475       take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
 476
 477       // Check for no more letters/specials
 478       if (take >= byte_length_) {
 479         take = byte_length_;
 480         break;
 481       }
 482
 483       // We are at a letter, nonletter, tag, or entity
 484       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
 485         // printf("\"%c\" ", next_byte_[take]);
 486         if (next_byte_[take] == '<') {
 487           // Begining of tag; skip to end and go around again
 488           tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
 489           sc = 0;
 490           // printf("<...> ");
 491         } else if (next_byte_[take] == '>') {
 492           // Unexpected end of tag; skip it and go around again
 493           tlen = 1;         // Over the >
 494           sc = 0;
 495           // printf("..> ");
 496         } else if (next_byte_[take] == '&') {
 497           // Expand entity, no advance
 498           EntityToBuffer(next_byte_ + take, byte_length_ - take,
 499                          script_buffer_ + put, &tlen, &plen);
 500           sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
 501         }
 502       } else {
 503         // Update 1..4
 504         tlen = cld_UniLib::OneCharLen(next_byte_ + take);
 505         sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
 506       }
 507       // printf("sc[%c]=%d ", next_byte_[take], sc);
 508       if (sc != 0) {break;}           // Letter found
 509       take += tlen;                   // Advance
 510     }     // End while not-letters
 511
 512     script_buffer_[put++] = ' ';
 513
 514     // We are at a letter again (or eos), after letter* not-letter*
 515     if (sc != spanscript) {break;}            // Letter in wrong script
 516     if (put >= getone::kMaxScriptBytes - 8) {
 517       // Buffer is almost full
 518       span->truncated = true;
 519       break;
 520     }
 521   }
 522
 523   // Update input position
 524   next_byte_ += take;
 525   byte_length_ -= take;
 526
 527   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
 528   //                          kMaxScriptBytes |   | put
 529   script_buffer_[put + 0] = ' ';
 530   script_buffer_[put + 1] = ' ';
 531   script_buffer_[put + 2] = ' ';
 532   script_buffer_[put + 3] = '\0';
 533
 534   span->text_bytes = put;       // Does not include the last four chars above
 535
 536   // printf(" %d]]\n\n", put);
 537   return true;
 538 }
 539
 540 // Force Latin, Cyrillic, Greek scripts to be lowercase
 541 void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
 542   // On Windows, text is lowercased beforehand, so no need to do anything here.
 543 #if !defined(CLD_WINDOWS)
 544   // If needed, lowercase all the text. If we do it sooner, might miss
 545   // lowercasing an entity such as &Aacute;
 546   // We only need to do this for Latn and Cyrl scripts
 547   if ((span->script == ULScript_Latin) ||
 548       (span->script == ULScript_Cyrillic) ||
 549       (span->script == ULScript_Greek)) {
 550     // Full Unicode lowercase of the entire buffer, including
 551     // four pad bytes off the end
 552     int consumed, filled;
 553     UniLib::ToLower(span->text, span->text_bytes + 4,
 554                     script_buffer_lower_, getone::kMaxScriptLowerBuffer,
 555                     &consumed, &filled);
 556     span->text = script_buffer_lower_;
 557     span->text_bytes = filled - 4;
 558   }
 559 #endif
 560 }
 561
 562 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
 563 // Force Latin and Cyrillic scripts to be lowercase
 564 bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
 565   bool ok = GetOneScriptSpan(span);
 566   LowerScriptSpan(span);
 567   return ok;
 568 }
 569
 570 // Gets lscript number for letters; always returns
 571 //   0 (common script) for non-letters
 572 int getone::GetUTF8LetterScriptNum(const char* src) {
 573   int srclen = cld_UniLib::OneCharLen(src);
 574   const uint8* usrc = reinterpret_cast<const uint8*>(src);
 575   return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
 576 }