Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / third_party / cld / encodings / compact_lang_det / compact_lang_det_impl.cc
blobc8a4a6baccd1672f48a44dfe5c983638c38a4f73
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include <stdio.h>
6 #include <string.h>
7 //#include <sys/time.h> // for gettimeofday
8 #include <string>
10 #include "encodings/lang_enc.h"
12 #include "encodings/compact_lang_det/compact_lang_det.h"
13 #include "encodings/compact_lang_det/compact_lang_det_impl.h"
14 #include "encodings/compact_lang_det/getonescriptspan.h"
15 #include "encodings/compact_lang_det/letterscript_enum.h"
16 #include "encodings/compact_lang_det/tote.h"
17 #include "encodings/compact_lang_det/utf8propjustletter.h"
18 #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
19 #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
21 #include "encodings/compact_lang_det/cldutil_dbg.h"
23 #include "encodings/compact_lang_det/win/cld_basictypes.h"
24 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
25 #include "encodings/compact_lang_det/win/cld_google.h"
26 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
// Linker supplies the right tables
// These objects are only declared here; their definitions live in other
// translation units, so the build decides which table data gets linked in.
// NOTE(review): judging by the names these are the CJK property table and the
// CJK-bigram, quadgram and long-word scoring tables -- confirm at the use
// sites.
extern const UTF8PropObj compact_lang_det_generated_ctjkvz_b1_obj;
extern const cld::CLDTableSummary kCjkBiTable_obj;
extern const cld::CLDTableSummary kQuadTable_obj;
extern const cld::CLDTableSummary kLongWord8Table_obj;
// Debugging / tuning flags. Each DEFINE_* invocation gives the flag name, its
// default value and its help text.
// NOTE(review): the DEFINE_* macros presumably come from
// win/cld_commandlineflags.h (included above) -- confirm how the flags are
// exposed to callers.
DEFINE_bool(cld_html, false, "Print language spans in HTML on stderr");
DEFINE_bool(cld_forcewords, false, "Score all words, in addition to quads");
DEFINE_bool(cld_showme, false, "Put squeeze/repeat points into HTML text");
DEFINE_bool(cld_echotext, false, "Print each scriptspan to stderr");
DEFINE_int32(cld_textlimit, 160, "Examine only initial n KB of actual text");
// 20 quadgrams is about 80 bytes or about 12 words in real text
DEFINE_int32(cld_smoothwidth, 20, "Smoothing window width in quadgrams");
// File-local tuning constants for the detector. Units are given where the
// original author stated them; anything marked NOTE(review) is an assumption
// drawn from the name only.

// Boosts applied to a hinted language.
static const int kLangHintInitial = 12;         // Boost language by N initially
static const int kLangHintBoost = 12;           // Boost language by N/16 per quadgram

// Handling of short script spans.
static const int kShortSpanThresh = 32;         // Bytes
static const int kMaxSecondChanceLen = 1024;    // Look at first 1K of short spans

// Cheap test that decides whether squeezing is attempted at all.
static const int kCheapSqueezeTestThresh = 4096;  // Only look for squeezing
                                                  // after this many text bytes
static const int kCheapSqueezeTestLen = 256;    // Bytes to test to trigger sqz
static const int kSpacesTriggerPercent = 25;    // Trigger sqz if >=25% spaces
static const int kPredictTriggerPercent = 67;   // Trigger sqz if >=67% predicted

// Per-chunk squeeze parameters.
static const int kChunksizeDefault = 48;        // Squeeze 48-byte chunks
static const int kSpacesThreshPercent = 25;     // Squeeze if >=25% spaces
static const int kPredictThreshPercent = 40;    // Squeeze if >=40% predicted

static const int kMaxSpaceScan = 32;            // Bytes

// NOTE(review): presumably the percentage of text the top language (and the
// top two languages together) must cover for a result to count as good --
// confirm at the use sites.
static const int kGoodLang1Percent = 70;
static const int kGoodLang1and2Percent = 93;
static const int kShortTextThresh = 256;        // Bytes

static const int kMinChunkSizeQuads = 4;        // Chunk is at least four quads
static const int kMaxChunkSizeQuads = 1024;     // Chunk is at most 1K quads

static const int kDefaultWordSpan = 256;        // Scan at least this many initial
                                                // bytes with word scoring
static const int kReallyBigWordSpan = 9999999;  // Forces word scoring all text

static const int kMinReliableSeq = 50;          // Record in seq if >= 50% reliable

static const int kPredictionTableSize = 4096;   // Must be exactly 4096 for
                                                // cheap compressor
79 // Generated by dsites 2008.07.07 from 10% of Base
82 // Three packed language probs, subscripted by Encoding
83 static const uint32 kEncodingHintProbs[] = {
84 0x00000000, // ASCII
85 0x18120cd5, // Latin2 POLISH.11 CZECH.5 HUNGARIAN.3
86 0x1d3a4bc9, // Latin3 AZERBAIJANI.10 BASQUE.3 CROATIAN.1
87 0x030819d4, // Latin4 ESTONIAN.11 ITALIAN.4 DUTCH.2
88 0x00000000, // ISO-8859-5
89 0x00003742, // Arabic ARABIC.12
90 0x00000000, // Greek
91 0x00000742, // Hebrew HEBREW.12
92 0x00002242, // Latin5 TURKISH.12
93 0x060419c9, // Latin6 ESTONIAN.10 FINNISH.3 GERMAN.1
94 0x00000942, // EUC-JP Japanese.12
95 0x00000942, // SJS Japanese.12
96 0x00000942, // JIS Japanese.12
97 0x00004642, // BIG5 ChineseT.12
98 0x00001142, // GB Chinese.12
99 0x46295fcd, // EUC-CN UIGHUR.10 MALAY.6 ChineseT.5
100 0x00000a42, // KSC Korean.12
101 0x00000000, // Unicode
102 0x03104674, // EUC ChineseT.9 SWEDISH.8 DUTCH.3
103 0x00000000, // CNS
104 0x0f1146c3, // BIG5-CP950 ChineseT.9 Chinese.5 SPANISH.4
105 0x00000942, // CP932 Japanese.12
106 0x00000000, // UTF8
107 0x00000000, // Unknown
108 0x00000000, // ASCII-7-bit
109 0x00000000, // KOI8R
110 0x00000000, // CP1251
111 0x00000000, // CP1252
112 0x00000000, // KOI8U
113 0x451d12cd, // CP1250 CZECH.10 CROATIAN.6 SLOVAK.5
114 0x0d06052a, // ISO-8859-15 FRENCH.9 GERMAN.8 PORTUGUESE.7
115 0x00002242, // CP1254 TURKISH.12
116 0x191516be, // CP1257 LITHUANIAN.8 LATVIAN.7 ESTONIAN.7
117 0x08003642, // ISO-8859-11 THAI.12 ITALIAN.1
118 0x00000000, // CP874
119 0x00003742, // CP1256 ARABIC.12
120 0x00000742, // CP1255 HEBREW.12
121 0x00000000, // ISO-8859-8-I
122 0x00000000, // VISUAL
123 0x00000000, // CP852
124 0x39001242, // CSN_369103 CZECH.12 ESPERANTO.1
125 0x00000000, // CP1253
126 0x00000000, // CP866
127 0x2e001944, // ISO-8859-13 ESTONIAN.12 ALBANIAN.3
128 0x08090a74, // ISO-2022-KR Korean.9 Japanese.8 ITALIAN.3
129 0x00001142, // GBK Chinese.12
130 0x4600113d, // GB18030 Chinese.11 ChineseT.7
131 0x00004642, // BIG5_HKSCS ChineseT.12
132 0x00000000, // ISO_2022_CN
133 0x00000000, // TSCII
134 0x00000000, // TAM
135 0x00000000, // TAB
136 0x00000000, // JAGRAN
137 0x00000000, // MACINTOSH
138 0x00000000, // UTF7
139 0x00000000, // BHASKAR
140 0x00000000, // HTCHANAKYA
141 0x090646ca, // UTF-16BE ChineseT.10 GERMAN.4 Japanese.2
142 0x00000000, // UTF-16LE
143 0x00000000, // UTF-32BE
144 0x00000000, // UTF-32LE
145 0x00000000, // X-BINARYENC
146 0x06001142, // HZ-GB-2312 Chinese.12 GERMAN.1
147 0x461109c2, // X-UTF8UTF8 Japanese.9 Chinese.5 ChineseT.3
148 0x00000000, // X-TAM-ELANGO
149 0x00000000, // X-TAM-LTTMBARANI
150 0x00000000, // X-TAM-SHREE
151 0x00000000, // X-TAM-TBOOMIS
152 0x00000000, // X-TAM-TMNEWS
153 0x00000000, // X-TAM-WEBTAMIL
154 0x00000000, // X-KDDI-Shift_JIS
155 0x00000000, // X-DoCoMo-Shift_JIS
156 0x00000000, // X-SoftBank-Shift_JIS
157 0x00000000, // X-KDDI-ISO-2022-JP
158 0x00000000, // X-SoftBank-ISO-2022-JP
161 COMPILE_ASSERT(arraysize(kEncodingHintProbs) == NUM_ENCODINGS,
162 kEncodingHintProbs_has_incorrect_size);
165 // Generated by dsites 2008.07.07 from 10% of Base
168 // Three packed language probs, subscripted by (anchor) language
169 static const uint32 kLanguageHintProbs[] = {
170 0x00000000, // ENGLISH
171 0x00000242, // DANISH DANISH.12
172 0x00000342, // DUTCH DUTCH.12
173 0x00000442, // FINNISH FINNISH.12
174 0x00000542, // FRENCH FRENCH.12
175 0x00000642, // GERMAN GERMAN.12
176 0x00000742, // HEBREW HEBREW.12
177 0x00000842, // ITALIAN ITALIAN.12
178 0x00000942, // Japanese Japanese.12
179 0x00000a42, // Korean Korean.12
180 0x51000b43, // NORWEGIAN NORWEGIAN.12 NORWEGIAN_N.2
181 0x00000c42, // POLISH POLISH.12
182 0x00000d42, // PORTUGUESE PORTUGUESE.12
183 0x00000000, // RUSSIAN
184 0x00000f42, // SPANISH SPANISH.12
185 0x00001042, // SWEDISH SWEDISH.12
186 0x00001142, // Chinese Chinese.12
187 0x00001242, // CZECH CZECH.12
188 0x00000000, // GREEK
189 0x47001442, // ICELANDIC ICELANDIC.12 FAROESE.1
190 0x00001542, // LATVIAN LATVIAN.12
191 0x00001642, // LITHUANIAN LITHUANIAN.12
192 0x00001742, // ROMANIAN ROMANIAN.12
193 0x00001842, // HUNGARIAN HUNGARIAN.12
194 0x00001942, // ESTONIAN ESTONIAN.12
195 0x00000000, // TG_UNKNOWN_LANGUAGE
196 0x00000000, // Unknown
197 0x00001c42, // BULGARIAN BULGARIAN.12
198 0x00001d42, // CROATIAN CROATIAN.12
199 0x1e001d46, // SERBIAN CROATIAN.12 SERBIAN.5
200 0x00000000, // IRISH
201 0x0f00203d, // GALICIAN GALICIAN.11 SPANISH.7
202 0x5e00213a, // TAGALOG TAGALOG.11 SOMALI.4
203 0x00002242, // TURKISH TURKISH.12
204 0x00002342, // UKRAINIAN UKRAINIAN.12
205 0x00000000, // HINDI
206 0x1c1e25d4, // MACEDONIAN MACEDONIAN.11 SERBIAN.4 BULGARIAN.2
207 0x00002642, // BENGALI BENGALI.12
208 0x00002742, // INDONESIAN INDONESIAN.12
209 0x00000000, // LATIN
210 0x2700293c, // MALAY MALAY.11 INDONESIAN.6
211 0x00000000, // MALAYALAM
212 0x00000000, // WELSH
213 0x00000000, // NEPALI
214 0x00000000, // TELUGU
215 0x00002e42, // ALBANIAN ALBANIAN.12
216 0x00000000, // TAMIL
217 0x00003042, // BELARUSIAN BELARUSIAN.12
218 0x00000000, // JAVANESE
219 0x00000000, // OCCITAN
220 0x375f3330, // URDU URDU.10 UIGHUR.7 ARABIC.4
221 0x41003436, // BIHARI BIHARI.10 MARATHI.10
222 0x00000000, // GUJARATI
223 0x0a4636b2, // THAI THAI.7 ChineseT.3 Korean.2
224 0x00003742, // ARABIC ARABIC.12
225 0x00003842, // CATALAN CATALAN.12
226 0x00003942, // ESPERANTO ESPERANTO.12
227 0x00003a42, // BASQUE BASQUE.12
228 0x00000000, // INTERLINGUA
229 0x00000000, // KANNADA
230 0x05060cca, // PUNJABI POLISH.10 GERMAN.4 FRENCH.2
231 0x00000000, // SCOTS_GAELIC
232 0x00003f42, // SWAHILI SWAHILI.12
233 0x00004042, // SLOVENIAN SLOVENIAN.12
234 0x00004142, // MARATHI MARATHI.12
235 0x00004242, // MALTESE MALTESE.12
236 0x00004342, // VIETNAMESE VIETNAMESE.12
237 0x00000000, // FRISIAN
238 0x12004543, // SLOVAK SLOVAK.12 CZECH.2
239 0x00004642, // ChineseT ChineseT.12
240 0x00000000, // FAROESE
241 0x00000000, // SUNDANESE
242 0x79004944, // UZBEK UZBEK.12 TAJIK.3
243 0x4d004a46, // AMHARIC AMHARIC.12 TIGRINYA.5
244 0x00004b42, // AZERBAIJANI AZERBAIJANI.12
245 0x00000000, // GEORGIAN
246 0x00000000, // TIGRINYA
247 0x00004e42, // PERSIAN PERSIAN.12
248 0x00000000, // BOSNIAN
249 0x00000000, // SINHALESE
250 0x00000000, // NORWEGIAN_N
251 0x00000000, // PORTUGUESE_P
252 0x00000000, // PORTUGUESE_B
253 0x00000000, // XHOSA
254 0x00000000, // ZULU
255 0x00000000, // GUARANI
256 0x00000000, // SESOTHO
257 0x00000000, // TURKMEN
258 0x7a005933, // KYRGYZ KYRGYZ.10 TATAR.7
259 0x00000000, // BRETON
260 0x00000000, // TWI
261 0x00000000, // YIDDISH
262 0x00000000, // SERBO_CROATIAN
263 0x00000000, // SOMALI
264 0x00005f42, // UIGHUR UIGHUR.12
265 0x00006042, // KURDISH KURDISH.12
266 0x00006142, // MONGOLIAN MONGOLIAN.12
267 0x051130c9, // ARMENIAN BELARUSIAN.10 Chinese.3 FRENCH.1
268 0x020f0521, // LAOTHIAN FRENCH.8 SPANISH.7 DANISH.6
269 0x64004e35, // SINDHI PERSIAN.10 SINDHI.9
270 0x00000000, // RHAETO_ROMANCE
271 0x00006642, // AFRIKAANS AFRIKAANS.12
272 0x00000000, // LUXEMBOURGISH
273 0x00006842, // BURMESE BURMESE.12
274 0x00002242, // KHMER TURKISH.12
275 0x88006a3c, // TIBETAN TIBETAN.11 DZONGKHA.6
276 0x00000000, // DHIVEHI
277 0x00000000, // CHEROKEE
278 0x00000000, // SYRIAC
279 0x00000000, // LIMBU
280 0x00000000, // ORIYA
281 0x00000000, // ASSAMESE
282 0x00000000, // CORSICAN
283 0x00000000, // INTERLINGUE
284 0x00007342, // KAZAKH KAZAKH.12
285 0x00000000, // LINGALA
286 0x00000000, // MOLDAVIAN
287 0x5f007645, // PASHTO PASHTO.12 UIGHUR.4
288 0x00000000, // QUECHUA
289 0x00000000, // SHONA
290 0x00007942, // TAJIK TAJIK.12
291 0x00000000, // TATAR
292 0x00000000, // TONGA
293 0x00000000, // YORUBA
294 0x00000000, // CREOLES_AND_PIDGINS_ENGLISH_BASED
295 0x00000000, // CREOLES_AND_PIDGINS_FRENCH_BASED
296 0x00000000, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
297 0x00000000, // CREOLES_AND_PIDGINS_OTHER
298 0x00000000, // MAORI
299 0x00000000, // WOLOF
300 0x00000000, // ABKHAZIAN
301 0x00000000, // AFAR
302 0x00000000, // AYMARA
303 0x00000000, // BASHKIR
304 0x00000000, // BISLAMA
305 0x00000000, // DZONGKHA
306 0x00000000, // FIJIAN
307 0x00000000, // GREENLANDIC
308 0x00000000, // HAUSA
309 0x00000000, // HAITIAN_CREOLE
310 0x00000000, // INUPIAK
311 0x00000542, // INUKTITUT FRENCH.12
312 0x00000000, // KASHMIRI
313 0x00000000, // KINYARWANDA
314 0x00000000, // MALAGASY
315 0x00000000, // NAURU
316 0x00000000, // OROMO
317 0x00000000, // RUNDI
318 0x00000000, // SAMOAN
319 0x00000000, // SANGO
320 0x344197d3, // SANSKRIT SANSKRIT.11 MARATHI.4 BIHARI.1
321 0x00000000, // SISWANT
322 0x00000000, // TSONGA
323 0x00000000, // TSWANA
324 0x00000000, // VOLAPUK
325 0x00000000, // ZHUANG
326 0x00000000, // KHASI
327 0x00000000, // SCOTS
328 0x00000000, // GANDA
329 0x00000000, // MANX
330 0x00000000, // MONTENEGRIN
331 // Add new language hints just before here (just use 0x00000000)
334 COMPILE_ASSERT(arraysize(kLanguageHintProbs) == NUM_LANGUAGES,
335 kLanguageHintProbs_has_incorrect_size);
338 // Generated by dsites 2008.07.07 from 10% of Base
341 typedef struct {
342 char key[4];
343 uint32 probs;
344 } HintEntry;
347 // Massaged TLD, followed by three packed language probs
348 // Hand-removed 4 items dsites 2008.07.15
349 static const int kTLDHintProbsSize = 201;
350 static const HintEntry kTLDHintProbs[kTLDHintProbsSize] = { // MaxRange 12
351 {{0x61,0x63,0x5f,0x5f}, 0x0a000945}, // ac__ Japanese.12 Korean.4
352 {{0x61,0x64,0x5f,0x5f}, 0x00003842}, // ad__ CATALAN.12
353 {{0x61,0x65,0x5f,0x5f}, 0x00003742}, // ae__ ARABIC.12
354 {{0x61,0x66,0x5f,0x5f}, 0x4e00763d}, // af__ PASHTO.11 PERSIAN.7
355 {{0x61,0x67,0x5f,0x5f}, 0x09000643}, // ag__ GERMAN.12 Japanese.2
356 {{0x61,0x69,0x5f,0x5f}, 0x0c180938}, // ai__ Japanese.11 HUNGARIAN.7 POLISH.2
357 {{0x61,0x6c,0x5f,0x5f}, 0x00002e42}, // al__ ALBANIAN.12
358 {{0x61,0x6e,0x5f,0x5f}, 0x6e00033d}, // an__ DUTCH.11 LIMBU.7
359 {{0x61,0x6f,0x5f,0x5f}, 0x05000d42}, // ao__ PORTUGUESE.12 FRENCH.1
360 {{0x61,0x71,0x5f,0x5f}, 0x05000f29}, // aq__ SPANISH.9 FRENCH.6
361 {{0x61,0x72,0x5f,0x5f}, 0x00000f42}, // ar__ SPANISH.12
362 {{0x61,0x73,0x5f,0x5f}, 0x0f120bcd}, // as__ NORWEGIAN.10 CZECH.6 SPANISH.5
363 {{0x61,0x74,0x5f,0x5f}, 0x00000642}, // at__ GERMAN.12
364 {{0x61,0x77,0x5f,0x5f}, 0x0f000345}, // aw__ DUTCH.12 SPANISH.4
365 {{0x61,0x78,0x5f,0x5f}, 0x00001042}, // ax__ SWEDISH.12
366 {{0x61,0x7a,0x5f,0x5f}, 0x00004b42}, // az__ AZERBAIJANI.12
367 {{0x62,0x61,0x5f,0x5f}, 0x00001d42}, // ba__ CROATIAN.12
368 {{0x62,0x62,0x5f,0x5f}, 0x00002842}, // bb__ LATIN.12
369 {{0x62,0x64,0x5f,0x5f}, 0x00002642}, // bd__ BENGALI.12
370 {{0x62,0x65,0x5f,0x5f}, 0x05000335}, // be__ DUTCH.10 FRENCH.9
371 {{0x62,0x66,0x5f,0x5f}, 0x00000542}, // bf__ FRENCH.12
372 {{0x62,0x67,0x5f,0x5f}, 0x00001c42}, // bg__ BULGARIAN.12
373 {{0x62,0x68,0x5f,0x5f}, 0x00003742}, // bh__ ARABIC.12
374 {{0x62,0x69,0x5f,0x5f}, 0x0f00053f}, // bi__ FRENCH.11 SPANISH.9
375 {{0x62,0x6a,0x5f,0x5f}, 0x00000542}, // bj__ FRENCH.12
376 {{0x62,0x6d,0x5f,0x5f}, 0x98043929}, // bm__ ESPERANTO.9 FINNISH.8 SISWANT.6
377 {{0x62,0x6e,0x5f,0x5f}, 0x00002942}, // bn__ MALAY.12
378 {{0x62,0x6f,0x5f,0x5f}, 0x00000f42}, // bo__ SPANISH.12
379 {{0x62,0x72,0x5f,0x5f}, 0x00000d42}, // br__ PORTUGUESE.12
380 {{0x62,0x74,0x5f,0x5f}, 0x00008842}, // bt__ DZONGKHA.12
381 {{0x62,0x77,0x5f,0x5f}, 0x06059ac4}, // bw__ TSWANA.9 FRENCH.6 GERMAN.5
382 {{0x62,0x79,0x5f,0x5f}, 0x00003024}, // by__ BELARUSIAN.9
383 {{0x62,0x7a,0x5f,0x5f}, 0x0f0a0924}, // bz__ Japanese.9 Korean.5 SPANISH.1
384 {{0x63,0x61,0x5f,0x5f}, 0x00000542}, // ca__ FRENCH.12
385 {{0x63,0x61,0x74,0x5f}, 0x00003842}, // cat_ CATALAN.12
386 {{0x63,0x64,0x5f,0x5f}, 0x06051224}, // cd__ CZECH.9 FRENCH.5 GERMAN.1
387 {{0x63,0x66,0x5f,0x5f}, 0x00000542}, // cf__ FRENCH.12
388 {{0x63,0x67,0x5f,0x5f}, 0x00000542}, // cg__ FRENCH.12
389 {{0x63,0x68,0x5f,0x5f}, 0x08050638}, // ch__ GERMAN.11 FRENCH.7 ITALIAN.2
390 {{0x63,0x69,0x5f,0x5f}, 0x00000542}, // ci__ FRENCH.12
391 {{0x63,0x6c,0x5f,0x5f}, 0x00000f42}, // cl__ SPANISH.12
392 {{0x63,0x6d,0x5f,0x5f}, 0x00000542}, // cm__ FRENCH.12
393 {{0x63,0x6e,0x5f,0x5f}, 0x00001142}, // cn__ Chinese.12
394 {{0x63,0x6f,0x5f,0x5f}, 0x00000f42}, // co__ SPANISH.12
395 // {{0x63,0x6f,0x6f,0x70}, 0x0f0509cd}, // coop Japanese.10 FRENCH.6 SPANISH.5
396 {{0x63,0x72,0x5f,0x5f}, 0x00000f42}, // cr__ SPANISH.12
397 {{0x63,0x75,0x5f,0x5f}, 0x00000f42}, // cu__ SPANISH.12
398 {{0x63,0x76,0x5f,0x5f}, 0x00000d42}, // cv__ PORTUGUESE.12
399 {{0x63,0x78,0x5f,0x5f}, 0x223a091f}, // cx__ Japanese.8 BASQUE.6 TURKISH.4
400 {{0x63,0x79,0x5f,0x5f}, 0x150622ba}, // cy__ TURKISH.8 GERMAN.4 LATVIAN.3
401 {{0x63,0x7a,0x5f,0x5f}, 0x00001242}, // cz__ CZECH.12
402 {{0x64,0x65,0x5f,0x5f}, 0x00000642}, // de__ GERMAN.12
403 {{0x64,0x6b,0x5f,0x5f}, 0x00000242}, // dk__ DANISH.12
404 {{0x64,0x6f,0x5f,0x5f}, 0x21000f42}, // do__ SPANISH.12 TAGALOG.1
405 {{0x64,0x7a,0x5f,0x5f}, 0x37000535}, // dz__ FRENCH.10 ARABIC.9
406 {{0x65,0x63,0x5f,0x5f}, 0x00000f42}, // ec__ SPANISH.12
407 // {{0x65,0x64,0x75,0x5f}, 0x2e0f3873}, // edu_ CATALAN.9 SPANISH.7 ALBANIAN.2
408 {{0x65,0x65,0x5f,0x5f}, 0x00001942}, // ee__ ESTONIAN.12
409 {{0x65,0x67,0x5f,0x5f}, 0x05003742}, // eg__ ARABIC.12 FRENCH.1
410 {{0x65,0x72,0x5f,0x5f}, 0x00000b42}, // er__ NORWEGIAN.12
411 {{0x65,0x73,0x5f,0x5f}, 0x38200fd4}, // es__ SPANISH.11 GALICIAN.4 CATALAN.2
412 {{0x65,0x74,0x5f,0x5f}, 0x39004a39}, // et__ AMHARIC.11 ESPERANTO.3
413 {{0x66,0x69,0x5f,0x5f}, 0x10000444}, // fi__ FINNISH.12 SWEDISH.3
414 {{0x66,0x6a,0x5f,0x5f}, 0x050489e0}, // fj__ FIJIAN.12 FINNISH.5 FRENCH.3
415 {{0x66,0x6f,0x5f,0x5f}, 0x00004742}, // fo__ FAROESE.12
416 {{0x66,0x72,0x5f,0x5f}, 0x00000542}, // fr__ FRENCH.12
417 {{0x67,0x61,0x5f,0x5f}, 0x00000542}, // ga__ FRENCH.12
418 {{0x67,0x64,0x5f,0x5f}, 0x061d05d5}, // gd__ FRENCH.11 CROATIAN.5 GERMAN.3
419 {{0x67,0x65,0x5f,0x5f}, 0x00004c2d}, // ge__ GEORGIAN.10
420 {{0x67,0x66,0x5f,0x5f}, 0x00000542}, // gf__ FRENCH.12
421 {{0x67,0x67,0x5f,0x5f}, 0x06002244}, // gg__ TURKISH.12 GERMAN.3
422 {{0x67,0x68,0x5f,0x5f}, 0x05000436}, // gh__ FINNISH.10 FRENCH.10
423 {{0x67,0x69,0x5f,0x5f}, 0x0f0538ce}, // gi__ CATALAN.10 FRENCH.7 SPANISH.6
424 {{0x67,0x6c,0x5f,0x5f}, 0x398a0238}, // gl__ DANISH.11 GREENLANDIC.7 ESPERANTO.2
425 {{0x67,0x6d,0x5f,0x5f}, 0x0600043e}, // gm__ FINNISH.11 GERMAN.8
426 {{0x67,0x6e,0x5f,0x5f}, 0x00000542}, // gn__ FRENCH.12
427 // {{0x67,0x6f,0x76,0x5f}, 0x05000f25}, // gov_ SPANISH.9 FRENCH.2
428 {{0x67,0x70,0x5f,0x5f}, 0x00000542}, // gp__ FRENCH.12
429 {{0x67,0x71,0x5f,0x5f}, 0x0f000547}, // gq__ FRENCH.12 SPANISH.6
430 {{0x67,0x73,0x5f,0x5f}, 0x00000942}, // gs__ Japanese.12
431 {{0x67,0x74,0x5f,0x5f}, 0x00000f42}, // gt__ SPANISH.12
432 {{0x68,0x6b,0x5f,0x5f}, 0x11004643}, // hk__ ChineseT.12 Chinese.2
433 {{0x68,0x6d,0x5f,0x5f}, 0x4606092e}, // hm__ Japanese.10 GERMAN.6 ChineseT.2
434 {{0x68,0x6e,0x5f,0x5f}, 0x00000f42}, // hn__ SPANISH.12
435 {{0x68,0x72,0x5f,0x5f}, 0x00001d42}, // hr__ CROATIAN.12
436 {{0x68,0x74,0x5f,0x5f}, 0x0f000542}, // ht__ FRENCH.12 SPANISH.1
437 {{0x68,0x75,0x5f,0x5f}, 0x00001842}, // hu__ HUNGARIAN.12
438 {{0x69,0x64,0x5f,0x5f}, 0x00002742}, // id__ INDONESIAN.12
439 {{0x69,0x65,0x5f,0x5f}, 0x050c1f24}, // ie__ IRISH.9 POLISH.5 FRENCH.1
440 {{0x69,0x6c,0x5f,0x5f}, 0x00000742}, // il__ HEBREW.12
441 {{0x69,0x6e,0x74,0x5f}, 0x0f060574}, // int_ FRENCH.9 GERMAN.8 SPANISH.3
442 {{0x69,0x6f,0x5f,0x5f}, 0x11090fd5}, // io__ SPANISH.11 Japanese.5 Chinese.3
443 {{0x69,0x71,0x5f,0x5f}, 0x60003744}, // iq__ ARABIC.12 KURDISH.3
444 {{0x69,0x72,0x5f,0x5f}, 0x00004e42}, // ir__ PERSIAN.12
445 {{0x69,0x73,0x5f,0x5f}, 0x00001442}, // is__ ICELANDIC.12
446 {{0x69,0x74,0x5f,0x5f}, 0x00000842}, // it__ ITALIAN.12
447 {{0x6a,0x65,0x5f,0x5f}, 0x29050328}, // je__ DUTCH.9 FRENCH.7 MALAY.5
448 {{0x6a,0x6d,0x5f,0x5f}, 0x040f0576}, // jm__ FRENCH.9 SPANISH.8 FINNISH.5
449 {{0x6a,0x6f,0x5f,0x5f}, 0x00003742}, // jo__ ARABIC.12
450 // {{0x6a,0x6f,0x62,0x73}, 0x0f060329}, // jobs DUTCH.9 GERMAN.8 SPANISH.6
451 {{0x6a,0x70,0x5f,0x5f}, 0x00000942}, // jp__ Japanese.12
452 {{0x6b,0x65,0x5f,0x5f}, 0x040f3fc3}, // ke__ SWAHILI.9 SPANISH.5 FINNISH.4
453 {{0x6b,0x69,0x5f,0x5f}, 0x04000643}, // ki__ GERMAN.12 FINNISH.2
454 {{0x6b,0x6d,0x5f,0x5f}, 0x00000542}, // km__ FRENCH.12
455 {{0x6b,0x70,0x5f,0x5f}, 0x00000a42}, // kp__ Korean.12
456 {{0x6b,0x72,0x5f,0x5f}, 0x00000a42}, // kr__ Korean.12
457 {{0x6b,0x77,0x5f,0x5f}, 0x00003742}, // kw__ ARABIC.12
458 {{0x6b,0x79,0x5f,0x5f}, 0x0500083f}, // ky__ ITALIAN.11 FRENCH.9
459 {{0x6b,0x7a,0x5f,0x5f}, 0x0000732d}, // kz__ KAZAKH.10
460 {{0x6c,0x62,0x5f,0x5f}, 0x05003747}, // lb__ ARABIC.12 FRENCH.6
461 {{0x6c,0x63,0x5f,0x5f}, 0x09000645}, // lc__ GERMAN.12 Japanese.4
462 {{0x6c,0x69,0x5f,0x5f}, 0x1600063d}, // li__ GERMAN.11 LITHUANIAN.7
463 {{0x6c,0x73,0x5f,0x5f}, 0x00005742}, // ls__ SESOTHO.12
464 {{0x6c,0x74,0x5f,0x5f}, 0x00001642}, // lt__ LITHUANIAN.12
465 {{0x6c,0x75,0x5f,0x5f}, 0x0600053d}, // lu__ FRENCH.11 GERMAN.7
466 {{0x6c,0x76,0x5f,0x5f}, 0x00001542}, // lv__ LATVIAN.12
467 {{0x6c,0x79,0x5f,0x5f}, 0x05003744}, // ly__ ARABIC.12 FRENCH.3
468 {{0x6d,0x61,0x5f,0x5f}, 0x3700053d}, // ma__ FRENCH.11 ARABIC.7
469 {{0x6d,0x63,0x5f,0x5f}, 0x00000542}, // mc__ FRENCH.12
470 {{0x6d,0x64,0x5f,0x5f}, 0x00001724}, // md__ ROMANIAN.9
471 {{0x6d,0x65,0x5f,0x5f}, 0x00001d42}, // me__ CROATIAN.12
472 {{0x6d,0x67,0x5f,0x5f}, 0x00000542}, // mg__ FRENCH.12
473 {{0x6d,0x6b,0x5f,0x5f}, 0x1c002543}, // mk__ MACEDONIAN.12 BULGARIAN.2
474 {{0x6d,0x6c,0x5f,0x5f}, 0x00000542}, // ml__ FRENCH.12
475 {{0x6d,0x6e,0x5f,0x5f}, 0x00006142}, // mn__ MONGOLIAN.12
476 {{0x6d,0x6f,0x5f,0x5f}, 0x110d4631}, // mo__ ChineseT.10 PORTUGUESE.8 Chinese.5
477 {{0x6d,0x71,0x5f,0x5f}, 0x00000542}, // mq__ FRENCH.12
478 {{0x6d,0x72,0x5f,0x5f}, 0x37000535}, // mr__ FRENCH.10 ARABIC.9
479 {{0x6d,0x73,0x5f,0x5f}, 0x090f06d5}, // ms__ GERMAN.11 SPANISH.5 Japanese.3
480 {{0x6d,0x74,0x5f,0x5f}, 0x00004242}, // mt__ MALTESE.12
481 {{0x6d,0x75,0x5f,0x5f}, 0x05000934}, // mu__ Japanese.10 FRENCH.8
482 {{0x6d,0x76,0x5f,0x5f}, 0x28000436}, // mv__ FINNISH.10 LATIN.10
483 {{0x6d,0x77,0x5f,0x5f}, 0x0611092a}, // mw__ Japanese.9 Chinese.8 GERMAN.7
484 {{0x6d,0x78,0x5f,0x5f}, 0x00000f42}, // mx__ SPANISH.12
485 {{0x6d,0x79,0x5f,0x5f}, 0x00002942}, // my__ MALAY.12
486 {{0x6d,0x7a,0x5f,0x5f}, 0x00000d42}, // mz__ PORTUGUESE.12
487 {{0x6e,0x61,0x5f,0x5f}, 0x06006644}, // na__ AFRIKAANS.12 GERMAN.3
488 {{0x6e,0x63,0x5f,0x5f}, 0x00000542}, // nc__ FRENCH.12
489 {{0x6e,0x65,0x5f,0x5f}, 0x8b000542}, // ne__ FRENCH.12 HAUSA.1
490 {{0x6e,0x66,0x5f,0x5f}, 0x00000542}, // nf__ FRENCH.12
491 {{0x6e,0x69,0x5f,0x5f}, 0x00000f42}, // ni__ SPANISH.12
492 {{0x6e,0x6c,0x5f,0x5f}, 0x00000342}, // nl__ DUTCH.12
493 {{0x6e,0x6f,0x5f,0x5f}, 0x51000b43}, // no__ NORWEGIAN.12 NORWEGIAN_N.2
494 {{0x6e,0x75,0x5f,0x5f}, 0x0300103b}, // nu__ SWEDISH.11 DUTCH.5
495 {{0x6f,0x6d,0x5f,0x5f}, 0x00003742}, // om__ ARABIC.12
496 {{0x70,0x61,0x5f,0x5f}, 0x00000f42}, // pa__ SPANISH.12
497 {{0x70,0x65,0x5f,0x5f}, 0x00000f42}, // pe__ SPANISH.12
498 {{0x70,0x66,0x5f,0x5f}, 0x00000542}, // pf__ FRENCH.12
499 {{0x70,0x67,0x5f,0x5f}, 0x00000f24}, // pg__ SPANISH.9
500 {{0x70,0x68,0x5f,0x5f}, 0x00002142}, // ph__ TAGALOG.12
501 {{0x70,0x6b,0x5f,0x5f}, 0x00003342}, // pk__ URDU.12
502 {{0x70,0x6c,0x5f,0x5f}, 0x30000c42}, // pl__ POLISH.12 BELARUSIAN.1
503 {{0x70,0x6e,0x5f,0x5f}, 0x04000644}, // pn__ GERMAN.12 FINNISH.3
504 {{0x70,0x72,0x5f,0x5f}, 0x00000f42}, // pr__ SPANISH.12
505 {{0x70,0x72,0x6f,0x5f}, 0x46050fd5}, // pro_ SPANISH.11 FRENCH.5 ChineseT.3
506 {{0x70,0x73,0x5f,0x5f}, 0x00003742}, // ps__ ARABIC.12
507 {{0x70,0x74,0x5f,0x5f}, 0x00000d42}, // pt__ PORTUGUESE.12
508 {{0x70,0x79,0x5f,0x5f}, 0x00000f42}, // py__ SPANISH.12
509 {{0x71,0x61,0x5f,0x5f}, 0x00003742}, // qa__ ARABIC.12
510 {{0x72,0x65,0x5f,0x5f}, 0x00000542}, // re__ FRENCH.12
511 {{0x72,0x6f,0x5f,0x5f}, 0x00001742}, // ro__ ROMANIAN.12
512 {{0x72,0x73,0x5f,0x5f}, 0x00001d42}, // rs__ CROATIAN.12
513 {{0x72,0x77,0x5f,0x5f}, 0x9000053e}, // rw__ FRENCH.11 KINYARWANDA.8
514 {{0x73,0x61,0x5f,0x5f}, 0x00003742}, // sa__ ARABIC.12
515 {{0x73,0x62,0x5f,0x5f}, 0x00000442}, // sb__ FINNISH.12
516 {{0x73,0x63,0x5f,0x5f}, 0x060f092f}, // sc__ Japanese.10 SPANISH.7 GERMAN.3
517 {{0x73,0x64,0x5f,0x5f}, 0x00003742}, // sd__ ARABIC.12
518 {{0x73,0x65,0x5f,0x5f}, 0x00001042}, // se__ SWEDISH.12
519 {{0x73,0x69,0x5f,0x5f}, 0x00004042}, // si__ SLOVENIAN.12
520 {{0x73,0x6b,0x5f,0x5f}, 0x12004543}, // sk__ SLOVAK.12 CZECH.2
521 {{0x73,0x6d,0x5f,0x5f}, 0x00000842}, // sm__ ITALIAN.12
522 {{0x73,0x6e,0x5f,0x5f}, 0x00000542}, // sn__ FRENCH.12
523 {{0x73,0x72,0x5f,0x5f}, 0x03001e44}, // sr__ SERBIAN.12 DUTCH.3
524 {{0x73,0x76,0x5f,0x5f}, 0x00000f42}, // sv__ SPANISH.12
525 {{0x73,0x79,0x5f,0x5f}, 0x00003742}, // sy__ ARABIC.12
526 {{0x74,0x63,0x5f,0x5f}, 0x0a2206cd}, // tc__ GERMAN.10 TURKISH.6 Korean.5
527 {{0x74,0x66,0x5f,0x5f}, 0x00000642}, // tf__ GERMAN.12
528 {{0x74,0x67,0x5f,0x5f}, 0x00000542}, // tg__ FRENCH.12
529 {{0x74,0x68,0x5f,0x5f}, 0x9e0936c9}, // th__ THAI.10 Japanese.3 SCOTS.1
530 {{0x74,0x6a,0x5f,0x5f}, 0x00007924}, // tj__ TAJIK.9
531 {{0x74,0x6c,0x5f,0x5f}, 0x060f0dcd}, // tl__ PORTUGUESE.10 SPANISH.6 GERMAN.5
532 {{0x74,0x6e,0x5f,0x5f}, 0x3700053e}, // tn__ FRENCH.11 ARABIC.8
533 {{0x74,0x6f,0x5f,0x5f}, 0x064609c5}, // to__ Japanese.9 ChineseT.7 GERMAN.6
534 {{0x74,0x70,0x5f,0x5f}, 0x06000944}, // tp__ Japanese.12 GERMAN.3
535 {{0x74,0x72,0x5f,0x5f}, 0x00002242}, // tr__ TURKISH.12
536 {{0x74,0x72,0x61,0x76}, 0x064509c3}, // trav Japanese.9 SLOVAK.5 GERMAN.4
537 {{0x74,0x74,0x5f,0x5f}, 0x0f00063e}, // tt__ GERMAN.11 SPANISH.8
538 {{0x74,0x77,0x5f,0x5f}, 0x00004642}, // tw__ ChineseT.12
539 {{0x74,0x7a,0x5f,0x5f}, 0x00003f42}, // tz__ SWAHILI.12
540 {{0x75,0x61,0x5f,0x5f}, 0x0000232d}, // ua__ UKRAINIAN.10
541 {{0x75,0x79,0x5f,0x5f}, 0x00000f42}, // uy__ SPANISH.12
542 {{0x75,0x7a,0x5f,0x5f}, 0x0000492d}, // uz__ UZBEK.10
543 {{0x76,0x61,0x5f,0x5f}, 0x060f0828}, // va__ ITALIAN.9 SPANISH.7 GERMAN.5
544 {{0x76,0x63,0x5f,0x5f}, 0x0d000939}, // vc__ Japanese.11 PORTUGUESE.3
545 {{0x76,0x65,0x5f,0x5f}, 0x00000f42}, // ve__ SPANISH.12
546 {{0x76,0x67,0x5f,0x5f}, 0x09000f43}, // vg__ SPANISH.12 Japanese.2
547 {{0x76,0x69,0x5f,0x5f}, 0x00002942}, // vi__ MALAY.12
548 {{0x76,0x6e,0x5f,0x5f}, 0x00004342}, // vn__ VIETNAMESE.12
549 {{0x76,0x75,0x5f,0x5f}, 0x00000642}, // vu__ GERMAN.12
550 {{0x77,0x73,0x5f,0x5f}, 0x4b0f0624}, // ws__ GERMAN.9 SPANISH.5 AZERBAIJANI.1
551 {{0x79,0x65,0x5f,0x5f}, 0x00003742}, // ye__ ARABIC.12
552 {{0x79,0x75,0x5f,0x5f}, 0x1e001d3d}, // yu__ CROATIAN.11 SERBIAN.7
553 {{0x7a,0x61,0x5f,0x5f}, 0x00006642}, // za__ AFRIKAANS.12
554 {{0x7a,0x6d,0x5f,0x5f}, 0x0b000435}, // zm__ FINNISH.10 NORWEGIAN.9
555 {{0x7a,0x77,0x5f,0x5f}, 0x3f00783e}, // zw__ SHONA.11 SWAHILI.8
559 // Statistically closest language, based on quadgram table
// Those that are far from other languages map to UNKNOWN_LANGUAGE
561 // Subscripted by Language
563 // From lang_correlation.txt and hand-edits
564 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
565 // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
566 // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
568 static const int kMinCorrPercent = 24; // Pick off how close you want
569 // 24 catches PERSIAN <== ARABIC
570 // but not SPANISH <== PORTUGESE
571 static Language Unknown = UNKNOWN_LANGUAGE;
573 // Subscripted by Language
574 static const Language kClosestAltLanguage[] = {
575 (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH
576 (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH
577 (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH
578 (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH
579 (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH
580 (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN
581 (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW
582 (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN
583 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese
584 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean
585 (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN
586 ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH
587 (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE
588 (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN
589 (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH
590 (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH
591 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese
592 (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH
593 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK
594 (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC
595 ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN
596 ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN
597 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN
598 ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN
599 (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN
600 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore
601 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown
602 (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN
603 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN
604 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN
605 (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH
606 (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN
607 ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG
608 (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH
609 (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN
610 (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI
611 (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN
612 (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI
613 (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN
614 ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN
615 (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY
616 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM
617 ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH
618 ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI
619 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU
620 ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN
621 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL
622 (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN
623 (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE
624 (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN
625 (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU
626 (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI
627 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI
628 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI
629 (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC
630 (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN
631 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO
632 ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE
633 ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA
634 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA
635 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI
636 (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC
637 ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI
638 (28 >= kMinCorrPercent) ? SERBO_CROATIAN : UNKNOWN_LANGUAGE, // SLOVENIAN
639 (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI
640 ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE
641 ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE
642 (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN
643 (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK
644 // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT
645 (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT
646 (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE
647 (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE
648 (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK
649 ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC
650 (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI
651 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN
652 ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA
653 (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN
654 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN
655 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE
656 (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N
657 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P
658 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B
659 (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA
660 (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU
661 ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI
662 (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO
663 ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN
664 ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ
665 ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON
666 ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI
667 (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH
668 (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN
669 (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI
670 ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR
671 (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH
672 ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN
673 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN
674 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN
675 ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI
676 (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE
677 (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS
678 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH
679 ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE
680 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER
681 (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN
682 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI
683 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE
684 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC
685 ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU
686 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA
687 (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE
688 (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN
689 ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE
690 ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH
691 ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA
692 (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN
693 (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO
694 ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA
695 ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA
696 (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK
697 (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR
698 (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA
699 ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA
700 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED
701 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED
702 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
703 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER
704 ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI
705 ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF
706 ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN
707 ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR
708 ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA
709 (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR
710 ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA
711 (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA
712 ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN
713 ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC
714 ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA
715 ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE
716 ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK
717 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT
718 ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI
719 (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA
720 ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY
721 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU
722 (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO
723 (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI
724 (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN
725 ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO
726 (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT
727 (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT
728 ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA
729 (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA
730 ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK
731 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG
732 ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI
733 (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS
734 (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA
735 ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX
736 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN
739 COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
740 kClosestAltLanguage_has_incorrect_size);
743 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
744 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
745 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
746 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
747 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
748 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
749 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
754 //------------------------------------------------------------------------------
755 // For --cld_html debugging output. Not thread safe
756 //------------------------------------------------------------------------------
757 static Language prior_lang = UNKNOWN_LANGUAGE;
758 static bool prior_unreliable = false;
760 //------------------------------------------------------------------------------
761 // End For --cld_html debugging output
762 //------------------------------------------------------------------------------
765 // Backscan to word boundary, returning how many bytes n to go back
766 // so that src - n is non-space ans src - n - 1 is space.
767 // If not found in kMaxSpaceScan bytes, return 0
768 int BackscanToSpace(const char* src, int limit) {
769 int n = 0;
770 limit = cld::minint(limit, kMaxSpaceScan);
771 while (n < limit) {
772 if (src[-n - 1] == ' ') {return n;} // We are at _X
773 ++n;
775 return 0;
778 // Forwardscan to word boundary, returning how many bytes n to go forward
779 // so that src + n is non-space ans src + n - 1 is space.
780 // If not found in kMaxSpaceScan bytes, return 0
781 int ForwardscanToSpace(const char* src, int limit) {
782 int n = 0;
783 limit = cld::minint(limit, kMaxSpaceScan);
784 while (n < limit) {
785 if (src[n] == ' ') {return n + 1;} // We are at _X
786 ++n;
788 return 0;
// This uses a cheap predictor to get a measure of compression, and
// hence a measure of repetitiveness. It works on complete UTF-8 characters
// instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
// all the time when done with a byte-based count. Sigh.
//
// To allow running prediction across multiple chunks, caller passes in current
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
//   isrc, srclen  text to examine
//   hash          in/out: running 12-bit hash, updated on return
//   tbl           in/out: prediction table indexed by that hash
//
// Returns the number of *bytes* correctly predicted, increments by 1..4 for
// each correctly-predicted character.
//
// NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text
//
int CountPredictedBytes(const char* isrc, int srclen, int* hash, int* tbl) {
  int p_count = 0;
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* srclimit = src + srclen;
  int local_hash = *hash;

  while (src < srclimit) {
    int c = src[0];
    int incr = 1;

    // Pick up one char and length
    if (c < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      // Do nothing more
    } else if ((c & 0xe0) == 0xc0) {
      // Two-byte
      c = (c << 8) | src[1];
      incr = 2;
    } else if ((c & 0xf0) == 0xe0) {
      // Three-byte
      c = (c << 16) | (src[1] << 8) | src[2];
      incr = 3;
    } else {
      // Four-byte
      c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
      incr = 4;
    }
    src += incr;

    int p = tbl[local_hash];            // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      // Count the bytes of the character, not just one per character, so the
      // result is comparable with byte-based thresholds.
      p_count += incr;
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }
  *hash = local_hash;
  return p_count;
}
// Counts the space characters in src, four bytes per step.
// Only whole groups of four are examined: the last (src_len % 4) bytes are
// not counted.
int CountSpaces4(const char* src, int src_len) {
  const int groups_end = src_len & ~3;   // src_len rounded down to a multiple of 4
  int total = 0;
  int i = 0;
  while (i < groups_end) {
    const char* quad = src + i;
    if (quad[0] == ' ') {++total;}
    if (quad[1] == ' ') {++total;}
    if (quad[2] == ' ') {++total;}
    if (quad[3] == ' ') {++total;}
    i += 4;
  }
  return total;
}
860 // Remove words of text that have more than half their letters predicted
861 // correctly by our cheap predictor, moving the remaining words in-place
862 // to the front of the input buffer.
864 // To allow running prediction across multiple chunks, caller passes in current
865 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
867 // Return the new, possibly-shorter length
869 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
870 // if input does
872 int CheapRepWordsInplace(char* isrc, int srclen, int* hash, int* tbl) {
873 const uint8* src = reinterpret_cast<const uint8*>(isrc);
874 const uint8* srclimit = src + srclen;
875 char* dst = isrc;
876 int local_hash = *hash;
877 char* word_dst = dst; // Start of next word
878 int good_predict_bytes = 0;
879 int word_length_bytes = 0;
881 while (src < srclimit) {
882 int c = src[0];
883 int incr = 1;
884 *dst++ = c;
886 if (c == ' ') {
887 if ((good_predict_bytes * 2) > word_length_bytes) {
888 // Word is well-predicted: backup to start of this word
889 dst = word_dst;
890 if (FLAGS_cld_showme) {
891 // Mark the deletion point with period
892 // Don't repeat multiple periods
893 // Cannot mark with more bytes or may overwrite unseen input
894 if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
895 *dst++ = '.';
896 *dst++ = ' ';
900 word_dst = dst; // Start of next word
901 good_predict_bytes = 0;
902 word_length_bytes = 0;
905 // Pick up one char and length
906 if (c < 0xc0) {
907 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
908 // Do nothing more
909 } else if ((c & 0xe0) == 0xc0) {
910 // Two-byte
911 *dst++ = src[1];
912 c = (c << 8) | src[1];
913 incr = 2;
914 } else if ((c & 0xf0) == 0xe0) {
915 // Three-byte
916 *dst++ = src[1];
917 *dst++ = src[2];
918 c = (c << 16) | (src[1] << 8) | src[2];
919 incr = 3;
920 } else {
921 // Four-byte
922 *dst++ = src[1];
923 *dst++ = src[2];
924 *dst++ = src[3];
925 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
926 incr = 4;
928 src += incr;
929 word_length_bytes += incr;
931 int p = tbl[local_hash]; // Prediction
932 tbl[local_hash] = c; // Update prediction
933 if (c == p) {
934 good_predict_bytes += incr; // Count good predictions
937 local_hash = ((local_hash << 4) ^ c) & 0xfff;
940 *hash = local_hash;
942 if ((dst - isrc) < (srclen - 3)) {
943 // Pad and make last char clean UTF-8 by putting following spaces
944 dst[0] = ' ';
945 dst[1] = ' ';
946 dst[2] = ' ';
947 dst[3] = '\0';
948 } else if ((dst - isrc) < srclen) {
949 // Make last char clean UTF-8 by putting following space off the end
950 dst[0] = ' ';
953 return static_cast<int>(dst - isrc);
957 // Remove portions of text that have a high density of spaces, or that are
958 // overly repetitive, squeezing the remaining text in-place to the front of the
959 // input buffer.
961 // Squeezing looks at density of space/prediced chars in fixed-size chunks,
962 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
964 // Return the new, possibly-shorter length
966 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
967 // if input does
969 int CompactLangDetImpl::CheapSqueezeInplace(char* isrc,
970 int srclen,
971 int ichunksize) {
972 char* src = isrc;
973 char* dst = src;
974 char* srclimit = src + srclen;
975 bool skipping = false;
977 int hash = 0;
978 // Allocate local prediction table.
979 int* predict_tbl = new int[kPredictionTableSize];
980 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
982 int chunksize = ichunksize;
983 if (chunksize == 0) {chunksize = kChunksizeDefault;}
984 int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
985 int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
987 while (src < srclimit) {
988 int remaining_bytes = srclimit - src;
989 int len = cld::minint(chunksize, remaining_bytes);
990 // Make len land us on a UTF-8 character boundary, and also fix
991 // mispredictions because we could get out of phase.
992 // Loop always terminates at trailing space in buffer.
993 while ((src[len] & 0xc0) == 0x80)
994 ++len; // Move past continuation bytes
996 int space_n = CountSpaces4(src, len);
997 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
998 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
999 // Skip the text
1000 if (!skipping) {
1001 // Keeping-to-skipping transition; do it at a space
1002 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
1003 dst -= n;
1004 skipping = true;
1005 if (FLAGS_cld_showme) {
1006 // Mark the deletion point with black square U+25A0
1007 *dst++ = 0xe2;
1008 *dst++ = 0x96;
1009 *dst++ = 0xa0;
1010 *dst++ = ' ';
1012 if (dst == isrc) {
1013 // Force a leading space if the first chunk is deleted
1014 *dst++ = ' ';
1017 } else {
1018 // Keep the text
1019 if (skipping) {
1020 // Skipping-to-keeping transition; do it at a space
1021 int n = ForwardscanToSpace(src, len);
1022 src += n;
1023 remaining_bytes -= n; // Shrink remaining length
1024 len -= n;
1025 skipping = false;
1027 // "len" can be negative in some cases
1028 if (len > 0) {
1029 memmove(dst, src, len);
1030 dst += len;
1033 src += len;
1036 if ((dst - isrc) < (srclen - 3)) {
1037 // Pad and make last char clean UTF-8 by putting following spaces
1038 dst[0] = ' ';
1039 dst[1] = ' ';
1040 dst[2] = ' ';
1041 dst[3] = '\0';
1042 } else if ((dst - isrc) < srclen) {
1043 // Make last char clean UTF-8 by putting following space off the end
1044 dst[0] = ' ';
1047 // Deallocate local prediction table
1048 delete[] predict_tbl;
1049 return static_cast<int>(dst - isrc);
1052 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
1053 // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
1054 // Just CountSpaces is about 340 MB/sec
1055 // Byte-only CountPredictedBytes is about 150 MB/sec
1056 // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
1057 // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
1058 // Unjammed byte-only both = 170 MB/sec
1059 // Jammed byte-only both = 120 MB/sec
1060 // Back to original w/slight updates, 110 MB/sec
1062 bool CheapSqueezeTriggerTest(const char* src, int srclen, int testsize) {
1063 // Don't trigger at all on short text
1064 if (srclen < testsize) {return false;}
1065 int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
1066 int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
1067 int hash = 0;
1068 // Allocate local prediction table.
1069 int* predict_tbl = new int[kPredictionTableSize];
1070 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
1072 bool retval = false;
1073 if ((CountSpaces4(src, testsize) >= space_thresh) ||
1074 (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
1075 predict_thresh)) {
1076 retval = true;
1078 // Deallocate local prediction table
1079 delete[] predict_tbl;
1080 return retval;
1085 // Close pairs (correlation) language_enum/language_enum
1086 // id/ms (0.47) 38/40 [1]
1087 // bo/dz (0.46) 105/135 [2]
1088 // cz/sk (0.43) 17/68 [3]
1089 // no/nn (0.42) 10/80 [4]
1090 // hi/mr (0.38) 35/64 [5]
1091 // xh/zu (0.37) 83/84 [6]
1092 // Subscripted by packed language, gives 0 or a subscript in closepair
1093 // scoring array inside doc_tote
1094 static const uint8 kClosePair[EXT_NUM_LANGUAGES + 1] = {
1096 0,0,0,0,0,0,0,0, 0,0,4,0,0,0,0,0, 0,3,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1097 0,0,0,5,0,0,1,0, 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1098 5,0,0,0,3,0,0,0, 0,0,0,0,0,0,0,0, 4,0,0,6,6,0,0,0, 0,0,0,0,0,0,0,0,
1099 0,0,0,0,0,0,0,0, 0,2,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1100 0,0,0,0,0,0,0,2, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1101 // Add new language close-pair number just before here (just use 0)
1105 // Delete any extended languages from doc_tote
1106 void RemoveExtendedLanguages(ToteWithReliability* doc_tote) {
1107 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1108 if (cld::UnpackLanguage(doc_tote->Key(sub)) >= NUM_LANGUAGES) {
1109 // Effectively remove the extended language by setting key&score to zero
1110 if (FLAGS_dbgscore) {
1111 fprintf(stderr, "{-%s} ",
1112 ExtLanguageCode(cld::UnpackLanguage(doc_tote->Key(sub))));
1115 // Delete entry
1116 doc_tote->SetKey(sub, 0);
1117 doc_tote->SetValue(sub, 0);
1118 doc_tote->SetReliability(sub, 0);
// A language whose reliable percent is below this is merged or removed
static const int kMinReliableKeepPercent = 41;

// A Tier3 language with fewer bytes than this is treated as unreliable,
// unless it accounts for all of the document's bytes
static const int kGoodFirstT3MinBytes = 24;
1128 // Move bytes for unreliable langs to another lang or UNKNOWN
1129 // doc_tote is sorted, so cannot Add
1131 // If both CHINESE and CHINESET are present and unreliable, do not delete both;
1132 // merge both into CHINESE.
1134 //dsites 2009.03.19
1135 // we also want to remove Tier3 languages as the first lang if there is very
1136 // little text like ej1 ej2 ej3 ej4
1137 // maybe fold this back in earlier
1139 void RemoveUnreliableLanguages(ToteWithReliability* doc_tote) {
1140 // Prepass to merge some low-reliablility languages
1141 int total_bytes = 0;
1142 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1143 int plang = doc_tote->Key(sub);
1144 if (plang == 0) {continue;} // Empty slot
1146 Language lang = cld::UnpackLanguage(plang);
1147 int bytes = doc_tote->Value(sub);
1148 int reli = doc_tote->Reliability(sub);
1149 if (bytes == 0) {continue;} // Zero bytes
1150 total_bytes += bytes;
1152 // Reliable percent is stored reliable score over stored bytecount
1153 int reliable_percent = reli / bytes;
1154 if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
1156 // This language is too unreliable to keep, but we might merge it.
1157 Language altlang = UNKNOWN_LANGUAGE;
1158 if (lang < NUM_LANGUAGES) {altlang = kClosestAltLanguage[lang];}
1159 if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative
1161 // Look for alternative in doc_tote
1162 int altsub = doc_tote->Find(cld::PackLanguage(altlang));
1163 if (altsub < 0) {continue;} // No alternative text
1165 int bytes2 = doc_tote->Value(altsub);
1166 int reli2 = doc_tote->Reliability(altsub);
1167 if (bytes2 == 0) {continue;} // Zero bytes
1169 // Reliable percent is stored reliable score over stored bytecount
1170 int reliable_percent2 = reli2 / bytes2;
1172 // Merge one language into the other. Break ties toward lower lang #
1173 int tosub = altsub;
1174 int fromsub = sub;
1175 bool into_lang = false;
1176 if ((reliable_percent2 < reliable_percent) ||
1177 ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
1178 tosub = sub;
1179 fromsub = altsub;
1180 into_lang = true;
1183 // Make sure reliability doesn't drop and is enough to avoid delete
1184 int newpercent = cld::maxint(reliable_percent, reliable_percent2);
1185 newpercent = cld::maxint(newpercent, kMinReliableKeepPercent);
1186 int newbytes = bytes + bytes2;
1187 int newreli = newpercent * newbytes;
1189 doc_tote->SetKey(fromsub, 0);
1190 doc_tote->SetValue(fromsub, 0);
1191 doc_tote->SetReliability(fromsub, 0);
1192 doc_tote->SetValue(tosub, newbytes);
1193 doc_tote->SetReliability(tosub, newreli);
1195 // Show fate of unreliable languages if at least 10 bytes
1196 if (FLAGS_cld_html /*&& (newpercent >= 10)*/ && (newbytes >= 10)) {
1197 if (into_lang) {
1198 fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
1199 ExtLanguageCode(altlang), reliable_percent2, bytes2,
1200 ExtLanguageCode(lang));
1201 } else {
1202 fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
1203 ExtLanguageCode(lang), reliable_percent, bytes,
1204 ExtLanguageCode(altlang));
1210 // Pass to delete any remaining unreliable languages
1211 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1212 int plang = doc_tote->Key(sub);
1213 if (plang == 0) {continue;} // Empty slot
1215 Language lang = cld::UnpackLanguage(plang);
1216 int bytes = doc_tote->Value(sub);
1217 int reli = doc_tote->Reliability(sub);
1218 if (bytes == 0) {continue;} // Zero bytes
1220 bool is_tier3 = (cld::kIsPackedTop40[plang] == 0);
1221 if (is_tier3 &&
1222 (bytes < kGoodFirstT3MinBytes) &&
1223 (bytes < total_bytes)) {
1224 reli = 0; // Too-short tier3
1227 // Reliable percent is stored as reliable score over stored bytecount
1228 int reliable_percent = reli / bytes;
1229 if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
1231 // Delete unreliable entry
1232 doc_tote->SetKey(sub, 0);
1233 doc_tote->SetValue(sub, 0);
1234 doc_tote->SetReliability(sub, 0);
1236 // Show fate of unreliable languages if at least 10 bytes
1237 if (FLAGS_cld_html /*&& (reliable_percent >= 10)*/ && (bytes >= 10)) {
1238 fprintf(stderr, "{Unreli %s.%d(%dB)} ",
1239 ExtLanguageCode(lang), reliable_percent, bytes);
1243 if (FLAGS_cld_html) {fprintf(stderr, "<br>\n");}
1247 // Move less likely byte count to more likely for close pairs of languages
1248 void RefineScoredClosePairs(ToteWithReliability* doc_tote) {
1249 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1250 int close_packedlang = doc_tote->Key(sub);
1251 int subscr = kClosePair[close_packedlang];
1252 if (subscr == 0) {continue;}
1254 // We have a close pair language -- if the other one is also scored and the
1255 // longword score differs enough, put all our eggs into one basket
1257 // Nonzero longword score: Go look for the other of this pair
1258 for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
1259 if (kClosePair[doc_tote->Key(sub2)] == subscr) {
1260 // We have a matching pair
1261 int close_packedlang2 = doc_tote->Key(sub2);
1263 // Move all the text bytes from lower byte-count to higher one
1264 int from_sub, to_sub;
1265 Language from_lang, to_lang;
1266 if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
1267 from_sub = sub;
1268 to_sub = sub2;
1269 from_lang = cld::UnpackLanguage(close_packedlang);
1270 to_lang = cld::UnpackLanguage(close_packedlang2);
1271 } else {
1272 from_sub = sub2;
1273 to_sub = sub;
1274 from_lang = cld::UnpackLanguage(close_packedlang2);
1275 to_lang = cld::UnpackLanguage(close_packedlang);
1278 // Move all the bytes smaller => larger of the pair
1279 if (FLAGS_cld_html || FLAGS_dbgscore) {
1280 // Show fate of closepair language
1281 int val = doc_tote->Value(from_sub);
1282 int reli = doc_tote->Reliability(from_sub);
1283 int reliable_percent = reli / (val ? val : 1); // avoid zdiv
1284 fprintf(stderr, "{CloseLangPair: %s.%d%%(%dB) => %s} ",
1285 ExtLanguageCode(from_lang),
1286 reliable_percent,
1287 doc_tote->Value(from_sub),
1288 ExtLanguageCode(to_lang));
1290 int sum = doc_tote->Value(to_sub) + doc_tote->Value(from_sub);
1291 doc_tote->SetValue(to_sub, sum);
1292 doc_tote->SetReliability(to_sub, 100 * sum);
1294 // Delete old entry
1295 doc_tote->SetKey(from_sub, 0);
1296 doc_tote->SetValue(from_sub, 0);
1297 doc_tote->SetReliability(from_sub, 0);
1299 break; // Exit inner for sub2 loop
1301 } // End for sub2
1302 } // End for sub
1306 void ApplyLanguageHints(Tote* chunk_tote, int tote_grams,
1307 uint8* lang_hint_boost) {
1308 // Need 8 quad/unigrams to give full hint boost, else derate linearly
1309 if (tote_grams > 8) {
1310 tote_grams = 8;
1312 for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
1313 // Hint boosts are per packed subscript
1314 int lang_sub = chunk_tote->Key(sub);
1315 int new_value = chunk_tote->Value(sub) +
1316 ((lang_hint_boost[lang_sub] * tote_grams) >> 3);
1317 chunk_tote->SetValue(sub, new_value);
1318 if (FLAGS_dbgscore && (lang_hint_boost[lang_sub] > 0)) {
1319 fprintf(stderr, "[%s+=%d*%d/8] ",
1320 ExtLanguageCode(cld::UnpackLanguage(lang_sub)),
1321 lang_hint_boost[lang_sub], tote_grams);
// Write the first len bytes of txt to f with the five HTML-special characters
// < > & ' " replaced by their entities, followed by "<br>" and a newline.
void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
  for (const char* p = txt; p < txt + len; ++p) {
    switch (*p) {
      case '<':
        fputs("&lt;", f);
        break;
      case '>':
        fputs("&gt;", f);
        break;
      case '&':
        fputs("&amp;", f);
        break;
      case '\'':
        fputs("&apos;", f);
        break;
      case '"':
        fputs("&quot;", f);
        break;
      default:
        fputc(*p, f);
        break;
    }
  }
  fputs("<br>\n", f);
}
1348 // Add one chunk's score to running document score
1349 // If the top language is UNKNOWN_LANGUAGE, score nothing. This is used to
1350 // positively identify text to be ignored, such as link farms.
1351 // Sort before scoring and reinit afterward
1353 // src and srclen are just for debug output
1354 void ScoreChunkIntoDoc(const char* src, int srclen, int advance_by,
1355 UnicodeLScript lscript,
1356 Tote* chunk_tote,
1357 ToteWithReliability* doc_tote,
1358 int tote_grams,
1359 uint8* lang_hint_boost) {
1360 // Apply hints before sorting
1361 if (lang_hint_boost) {
1362 ApplyLanguageHints(chunk_tote, tote_grams, lang_hint_boost);
1365 // Sort to get top two languages
1366 chunk_tote->Sort(2);
1367 Language cur_lang = cld::UnpackLanguage(chunk_tote->Key(0));
1369 // Return if empty
1370 if (cur_lang < 0) {
1371 chunk_tote->Reinit();
1372 return;
1375 bool cur_unreliable = false;
1377 // Reliability is a function of mean script score per KB of text
1378 int len = chunk_tote->GetByteCount();
1379 int reliability = cld::GetReliability((len * 2) / advance_by,
1380 lscript,
1381 chunk_tote);
1382 cur_unreliable = (reliability < cld::kMinReliable);
1384 // If tote_grams=0, always reliable
1385 // If tote_grams=1, always unreliable
1386 if (tote_grams == 0) {
1387 reliability = 100;
1388 cur_unreliable = false;
1389 } else if (tote_grams == 1) {
1390 reliability = 0;
1391 cur_unreliable = true;
1394 #if 0
1395 // TEMP
1396 if (FLAGS_cld_html) {
1397 if (reliability >= kMinReliableKeepPercent) {
1398 fprintf(stderr, "R%d%% ", reliability);
1399 } else {
1400 fprintf(stderr, "--R%d%% ", reliability);
1403 #endif
1405 // Track the sequence of language fragments [result currently unused]
1406 ////if (reliability >= kMinReliableSeq) {
1407 //// doc_tote->AddSeq(chunk_tote->Key(0));
1408 ////}
1410 if (cur_unreliable && (chunk_tote->Key(1) != 0)) {
1411 // Unreliable and two top contenders, split byte count 5/8 - 3/8
1412 int top_len = ((len * 5) + 4) >> 3;
1413 int second_len = len - top_len;
1415 doc_tote->Add(chunk_tote->Key(0),
1416 top_len, chunk_tote->Value(0), reliability);
1417 doc_tote->Add(chunk_tote->Key(1),
1418 second_len, chunk_tote->Value(1), reliability);
1419 if (FLAGS_dbgscore) {
1420 fprintf(stderr, "{+%s.%d.%dR(%dB) +%s.%d.%dR(%dB)} ",
1421 ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
1422 chunk_tote->Value(0),
1423 reliability,
1424 top_len,
1425 ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(1))),
1426 chunk_tote->Value(1),
1427 reliability,
1428 second_len);
1430 } else {
1431 // Reliable or single contender
1432 doc_tote->Add(chunk_tote->Key(0),
1433 len, chunk_tote->Value(0), reliability);
1434 if (FLAGS_dbgscore) {
1435 fprintf(stderr, "{+%s.%d.%dR(%dB)} ",
1436 ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
1437 chunk_tote->Value(0),
1438 reliability,
1439 len);
1443 if (FLAGS_cld_html) {
1444 if (cur_lang < 0) {cur_lang = UNKNOWN_LANGUAGE;}
1445 cld::PrintLang(stderr, chunk_tote,
1446 cur_lang, cur_unreliable,
1447 prior_lang, prior_unreliable);
1448 prior_lang = cur_lang;
1449 prior_unreliable = cur_unreliable;
1451 string temp(src, srclen);
1452 if (temp[0] == '=') {
1453 // Rewrite =ScriptX= or =SwitchX= as =Xxxx= for script code Xxxx
1454 temp = "=Buffered_";
1455 temp.append(UnicodeLScriptCode(lscript));
1456 temp.append("=");
1458 cld::PrintText(stderr, cur_lang, temp);
1461 chunk_tote->Reinit();
1465 void PrintTopLang(Language top_lang) {
1466 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1467 fprintf(stderr, "[] ");
1468 } else {
1469 fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
1470 prior_lang = top_lang;
1474 void PrintTopLangSpeculative(Language top_lang) {
1475 fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
1476 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1477 fprintf(stderr, "[] ");
1478 } else {
1479 fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
1480 prior_lang = top_lang;
1482 fprintf(stderr, "</span>\n");
1486 // Add one chunk's score to running document score
1487 // Convenience function with constant src text
1488 void ScoreChunkIntoDoc2(const char* src, int advance_by,
1489 UnicodeLScript lscript,
1490 Tote* chunk_tote,
1491 ToteWithReliability* doc_tote,
1492 int tote_grams,
1493 uint8* lang_hint_boost) {
1494 int srclen = static_cast<int>(strlen(src));
1495 ScoreChunkIntoDoc(src, srclen, advance_by, lscript, chunk_tote,
1496 doc_tote, tote_grams, lang_hint_boost);
1500 // Score one scriptspan using the only language for that script
1501 void ScoreNilgrams(getone::LangSpan* scriptspan, int lang,
1502 ToteWithReliability* doc_tote,
1503 uint8* lang_hint_boost,
1504 int flags, Language plus_one) {
1505 // For debugging only. Not thread-safe
1506 prior_lang = UNKNOWN_LANGUAGE;
1507 prior_unreliable = false;
1509 const char* src = scriptspan->text;
1510 int len = scriptspan->text_bytes;
1512 Tote chunk_tote;
1513 // Score 1000 for 1000 bytes
1514 chunk_tote.AddGram();
1515 chunk_tote.Add(lang, scriptspan->text_bytes);
1516 chunk_tote.AddBytes(scriptspan->text_bytes);
1517 int advance_by = 2;
1518 int tote_grams = 0; // Indicates fully reliable
1519 ScoreChunkIntoDoc(src, len, advance_by,
1520 scriptspan->script, &chunk_tote,
1521 doc_tote, tote_grams, lang_hint_boost);
1524 // Score one scriptspan using unigrams
1525 // Updates tote_grams
1526 static void ScoreUnigrams(const UTF8PropObj* unigram_obj,
1527 getone::LangSpan* scriptspan,
1528 int* tote_grams, int gram_limit,
1529 Tote* chunk_tote,
1530 ToteWithReliability* doc_tote,
1531 uint8* lang_hint_boost,
1532 int advance_by, int flags,
1533 int* initial_word_span, Language plus_one) {
1534 // chunk_tote may have partial sum coming in
1535 const char* src = scriptspan->text;
1536 const char* srclimit = src + scriptspan->text_bytes;
1538 // For debugging only. Not thread-safe
1539 prior_lang = UNKNOWN_LANGUAGE;
1540 prior_unreliable = false;
1542 // Break text up into multiple chunks and score each
1543 while (src < srclimit) {
1544 // Updates tote_grams
1545 int len = cld::DoUniScoreV3(unigram_obj,
1546 src, srclimit - src, advance_by,
1547 tote_grams, gram_limit, chunk_tote);
1548 if (FlagUseWords(flags) || (*initial_word_span > 0)) {
1549 // Use bigram scoring in addition to quadgrams
1550 cld::DoBigramScoreV3(&kCjkBiTable_obj,
1551 src, len, chunk_tote);
1553 chunk_tote->AddBytes(len);
1554 *initial_word_span -= len;
1556 if (*tote_grams >= gram_limit) {
1557 // Add this chunk to doc totals
1558 // Remove all but top40 if asked
1559 if (FlagTop40(flags)) {
1560 cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
1563 // Sort, accumulate into doc total, reinit
1564 ScoreChunkIntoDoc(src, len, advance_by,
1565 scriptspan->script, chunk_tote,
1566 doc_tote, *tote_grams, lang_hint_boost);
1567 *tote_grams = 0;
1568 } else {
1569 if (FLAGS_cld_html) {
1570 string temp(src, len);
1571 Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
1572 PrintTopLangSpeculative(top_lang);
1573 cld::PrintText(stderr, top_lang, temp);
1576 src += len;
1578 // chunk_tote may have partial sum going out
1581 // Back up one UTF-8 character
1582 const uint8* BackOneUTF8(const uint8* p) {
1583 const uint8* retval = p - 1;
1584 if ((*retval & 0xc0) == 0x80) {--retval;}
1585 if ((*retval & 0xc0) == 0x80) {--retval;}
1586 if ((*retval & 0xc0) == 0x80) {--retval;}
1587 return retval;
1591 // Score one scriptspan using quadgrams
1592 // Incoming chunk_tote may have partial accumulation
1593 static void ScoreQuadgrams(const cld::CLDTableSummary* quadgram_obj,
1594 getone::LangSpan* scriptspan,
1595 int* tote_grams, int gram_limit,
1596 Tote* chunk_tote,
1597 ToteWithReliability* doc_tote,
1598 uint8* lang_hint_boost,
1599 int advance_by, int flags,
1600 int* initial_word_span, Language plus_one) {
1601 // chunk_tote may have partial sum coming in
1602 const char* src = scriptspan->text;
1603 const char* srclimit = src + scriptspan->text_bytes;
1604 const char* lastscored_src = src;
1606 // For debugging only. Not thread-safe
1607 prior_lang = UNKNOWN_LANGUAGE;
1608 prior_unreliable = false;
1610 // Break text up into multiple chunks and score each
1611 while (src < srclimit) {
1612 // Updates tote_grams
1613 int len = cld::DoQuadScoreV3(quadgram_obj,
1614 src, srclimit - src, advance_by,
1615 tote_grams, gram_limit, chunk_tote);
1616 if (FlagUseWords(flags) || (*initial_word_span > 0)) {
1617 // Use word scoring in addition to quadgrams
1618 cld::DoOctaScoreV3(&kLongWord8Table_obj,
1619 src, len, chunk_tote);
1621 chunk_tote->AddBytes(len);
1622 *initial_word_span -= len;
1624 if (*tote_grams >= gram_limit) {
1625 // Remove all but top40 if asked
1626 if (FlagTop40(flags)) {
1627 cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
1630 // Sort, accumulate into doc total, reinit
1631 ScoreChunkIntoDoc(src, len, advance_by,
1632 scriptspan->script, chunk_tote,
1633 doc_tote, *tote_grams, lang_hint_boost);
1634 lastscored_src = src + len;
1635 *tote_grams = 0;
1636 } else {
1637 if (FLAGS_cld_html) {
1638 string temp(src, len);
1639 Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
1640 PrintTopLangSpeculative(top_lang);
1641 cld::PrintText(stderr, top_lang, temp);
1644 src += len;
1650 void PrintLangs(FILE* f, const Language* language3, const int* percent3,
1651 const int* text_bytes, const bool* is_reliable) {
1652 fprintf(f, "<br>&nbsp;&nbsp;Initial_Languages ");
1653 if (language3[0] != UNKNOWN_LANGUAGE) {
1654 fprintf(f, "%s%s(%d%%) ",
1655 ExtLanguageName(language3[0]),
1656 *is_reliable ? "" : "*",
1657 percent3[0]);
1659 if (language3[1] != UNKNOWN_LANGUAGE) {
1660 fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[1]), percent3[1]);
1662 if (language3[2] != UNKNOWN_LANGUAGE) {
1663 fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[2]), percent3[2]);
1665 fprintf(f, "%d bytes \n", *text_bytes);
1667 fprintf(f, "<br>\n");
1671 // Start the tote with a count of one for the default language for script
1672 void InitScriptToteLang(Tote* script_tote, UnicodeLScript lscript) {
1673 Language defaultlang = cld::kDefaultLanguagePerLScript[lscript];
1674 script_tote->Add(cld::PackLanguage(defaultlang), 1);
1675 script_tote->AddBytes(1);
1676 #if 0
1677 if (FLAGS_cld_html) {
1678 cld::PrintLang(stderr, script_tote,
1679 defaultlang, false,
1680 UNKNOWN_LANGUAGE, false);
1681 prior_lang = cur_lang;
1682 string temp("+1");
1683 cld::PrintText(stderr, defaultlang, temp);
1685 #endif
// Pseudo-text labels passed as the chunk "source" when one of the four
// pending script totes is flushed at end of document. Index matches the
// tote number: [0] Latin, [1] Han, [2] and [3] dynamically assigned scripts.
static const char* const kToteName[4] = {
  "=Latn=",
  "=Hani=",
  "=Script2=",
  "=Script3=",
};

// Pseudo-text labels passed as the chunk "source" when a tote is flushed
// because its slot is being reassigned to a different script.
static const char* const kToteSwitch[4] = {
  "=Latn=",
  "=Hani=",
  "=Switch2=",
  "=Switch3=",
};
// Byte -> normalized byte. ASCII upper- and lowercase letters map to the
// lowercase letter, ASCII digits map to themselves, and every other byte
// value maps to minus '-' (0x2d).
static const char kCharsetToLowerTbl[256] = {
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,

  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
};


// 1 for the ASCII letters A-Z and a-z, 0 for every other byte value
static const char kIsAlpha[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// 1 for the ASCII digits 0-9, 0 for every other byte value
static const char kIsDigit[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// Normalize ASCII string to first 4 alphabetic/digit chars.
// Letters are forced to lowercase ASCII; all other characters are skipped.
// Used to normalize TLD values.
//   str:  NUL-terminated input string
//   norm: output buffer of at least 4 bytes; always receives exactly 4 bytes,
//         padded on the right with '_' when fewer than four letters/digits
//         are found. No NUL terminator is written.
// The scan walks str once and stops as soon as four characters have been
// collected, so the cost does not depend on the length of the remainder.
void MakeChar4(const char* str, char* norm) {
  memcpy(norm, "____", 4);     // four underscores
  int l_ptr = 0;
  for (const char* p = str; (*p != '\0') && (l_ptr < 4); ++p) {
    const unsigned char uc = static_cast<unsigned char>(*p);
    if (kIsAlpha[uc] | kIsDigit[uc]) {
      norm[l_ptr] = kCharsetToLowerTbl[uc];
      l_ptr++;
    }
  }
}
1760 // Find subscript of matching key in first 4 bytes of sorted hint array, or -1
1761 static int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
1762 const char* norm_key) {
1763 // Key is always in range [lo..hi)
1764 int lo = 0;
1765 int hi = hintprobssize;
1766 while (lo < hi) {
1767 int mid = (lo + hi) >> 1;
1768 int comp = memcmp(&hintprobs[mid].key[0], norm_key, 4);
1769 if (comp < 0) {
1770 lo = mid + 1;
1771 } else if (comp > 0) {
1772 hi = mid;
1773 } else {
1774 return mid;
1777 return -1;
1781 // Increment the initial probabilities based on a per-TLD probs entry
1782 void ApplyTLDHint(uint8* lang_hint_boost, const char* tld_hint) {
1783 if (FLAGS_dbgscore) {
1784 fprintf(stderr, "TLD hint %s\n", tld_hint);
1786 char normalized_tld[8];
1787 MakeChar4(tld_hint, normalized_tld);
1788 int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
1789 normalized_tld);
1790 // TLD is four bytes, probability entry is 4 bytes
1791 if (n >= 0) {
1792 uint32 probs = kTLDHintProbs[n].probs;
1794 uint8 prob123 = (probs >> 0) & 0xff;
1795 const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1796 uint8 top1 = (probs >> 8) & 0xff;
1797 if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1798 uint8 top2 = (probs >> 16) & 0xff;
1799 if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1800 uint8 top3 = (probs >> 24) & 0xff;
1801 if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1806 // Increment the initial probabilities based on a per-encoding probs entry
1807 void ApplyEncodingHint(uint8* lang_hint_boost, int encoding_hint) {
1808 if (FLAGS_dbgscore) {
1809 Encoding tempenc = static_cast<Encoding>(encoding_hint);
1810 fprintf(stderr, "ENC hint %s\n", EncodingName(tempenc));
1812 if (encoding_hint < ISO_8859_1) {return;}
1813 if (encoding_hint >= NUM_ENCODINGS) {return;}
1814 uint32 probs = kEncodingHintProbs[encoding_hint];
1816 uint8 prob123 = (probs >> 0) & 0xff;
1817 const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1818 uint8 top1 = (probs >> 8) & 0xff;
1819 if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1820 uint8 top2 = (probs >> 16) & 0xff;
1821 if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1822 uint8 top3 = (probs >> 24) & 0xff;
1823 if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1827 // Increment the initial probability for given language by fixed amount
1828 // Does not recognize extended languages as hints
1829 void ApplyLanguageHint(uint8* lang_hint_boost, Language language_hint) {
1830 if (FLAGS_dbgscore) {
1831 fprintf(stderr, "LANG hint %s\n", ExtLanguageName(language_hint));
1833 if (language_hint < ENGLISH) {return;}
1834 if (language_hint >= NUM_LANGUAGES) {return;}
1835 uint32 probs = kLanguageHintProbs[language_hint];
1837 uint8 prob123 = (probs >> 0) & 0xff;
1838 const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
1839 uint8 top1 = (probs >> 8) & 0xff;
1840 if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
1841 uint8 top2 = (probs >> 16) & 0xff;
1842 if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
1843 uint8 top3 = (probs >> 24) & 0xff;
1844 if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
1847 // Extract return values before fixups
1848 void ExtractLangEtc(ToteWithReliability* doc_tote, int total_text_bytes,
1849 int* reliable_percent3, Language* language3, int* percent3,
1850 double* normalized_score3,
1851 int* text_bytes, bool* is_reliable) {
1852 reliable_percent3[0] = 0;
1853 reliable_percent3[1] = 0;
1854 reliable_percent3[2] = 0;
1855 language3[0] = UNKNOWN_LANGUAGE;
1856 language3[1] = UNKNOWN_LANGUAGE;
1857 language3[2] = UNKNOWN_LANGUAGE;
1858 percent3[0] = 100;
1859 percent3[1] = 0;
1860 percent3[2] = 0;
1861 normalized_score3[0] = 0.0;
1862 normalized_score3[1] = 0.0;
1863 normalized_score3[2] = 0.0;
1865 *text_bytes = total_text_bytes;
1866 *is_reliable = false;
1868 int bytecount1 = total_text_bytes;
1869 int bytecount2 = 0;
1870 int bytecount3 = 0;
1872 int lang1 = doc_tote->Key(0);
1873 if (lang1 != 0) {
1874 // We have a top language
1875 language3[0] = cld::UnpackLanguage(lang1);
1876 bytecount1 = doc_tote->Value(0);
1877 int reli1 = doc_tote->Reliability(0);
1878 reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv
1879 normalized_score3[0] = cld::GetNormalizedScore(language3[0],
1880 ULScript_Common,
1881 bytecount1,
1882 doc_tote->Score(0));
1885 int lang2 = doc_tote->Key(1);
1886 if (lang2 != 0) {
1887 language3[1] = cld::UnpackLanguage(lang2);
1888 bytecount2 = doc_tote->Value(1);
1889 int reli2 = doc_tote->Reliability(1);
1890 reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv
1891 normalized_score3[1] = cld::GetNormalizedScore(language3[1],
1892 ULScript_Common,
1893 bytecount2,
1894 doc_tote->Score(1));
1897 int lang3 = doc_tote->Key(2);
1898 if (lang3 != 0) {
1899 language3[2] = cld::UnpackLanguage(lang3);
1900 bytecount3 = doc_tote->Value(2);
1901 int reli3 = doc_tote->Reliability(2);
1902 reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv
1903 normalized_score3[2] = cld::GetNormalizedScore(language3[2],
1904 ULScript_Common,
1905 bytecount3,
1906 doc_tote->Score(2));
1909 // Increase total bytes to sum (top 3) if low for some reason
1910 int total_bytecount12 = bytecount1 + bytecount2;
1911 int total_bytecount123 = total_bytecount12 + bytecount3;
1912 if (total_text_bytes < total_bytecount123) {
1913 total_text_bytes = total_bytecount123;
1914 *text_bytes = total_text_bytes;
1917 // Sum minus previous % gives better roundoff behavior than bytecount/total
1918 int total_text_bytes_div = cld::maxint(1, total_text_bytes); // Avoid zdiv
1919 percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
1920 percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
1921 percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
1922 percent3[2] -= percent3[1];
1923 percent3[1] -= percent3[0];
1925 // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
1926 // Fix this explicitly
1927 if (percent3[1] < percent3[2]) {
1928 ++percent3[1];
1929 --percent3[2];
1931 if (percent3[0] < percent3[1]) {
1932 ++percent3[0];
1933 --percent3[1];
1936 *text_bytes = total_text_bytes;
1938 if (lang1 != 0) {
1939 // We have a top language
1940 // Its reliability is overal result reliability
1941 int bytecount = doc_tote->Value(0);
1942 int reli = doc_tote->Reliability(0);
1943 int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv
1944 *is_reliable = reliable_percent >= cld::kMinReliable;
1945 } else {
1946 // No top language at all. This can happen with zero text or 100% Klingon
1947 // if extended=false. Just return all UNKNOWN_LANGUAGE, reliable.
1948 *is_reliable = true;
1952 bool IsFIGS(Language lang) {
1953 if (lang == FRENCH) {return true;}
1954 if (lang == ITALIAN) {return true;}
1955 if (lang == GERMAN) {return true;}
1956 if (lang == SPANISH) {return true;}
1957 return false;
1960 bool IsEFIGS(Language lang) {
1961 if (lang == ENGLISH) {return true;}
1962 if (lang == FRENCH) {return true;}
1963 if (lang == ITALIAN) {return true;}
1964 if (lang == GERMAN) {return true;}
1965 if (lang == SPANISH) {return true;}
1966 return false;
// Thresholds used by CalcSummaryLang() when picking the summary language.

// English first + second language below this percent => keep English
static const int kNonEnBoilerplateMinPercent = 17;
// FIGS first + second language below this percent => keep the FIGS language
static const int kNonFIGSBoilerplateMinPercent = 20;
// Returned language below this percent => summary is UNKNOWN_LANGUAGE
static const int kGoodFirstMinPercent = 26;
// Returned language below this percent => result marked unreliable
static const int kGoodFirstReliableMinPercent = 51;
// Ignored text above this percent => result marked unreliable
static const int kIgnoreMaxPercent = 95;
// Chosen language below this percent => result marked unreliable
static const int kKeepMinPercent = 2;

// For Tier3 languages, require more bytes of text to override
// the first-place language

// Top-40 second language with fewer bytes than this => no second
static const int kGoodSecondT1T2MinBytes = 15;
// Tier3 second language with fewer bytes than this => no second
static const int kGoodSecondT3MinBytes = 128;
1982 // Calculate a single summary language for the document, and its reliability.
1983 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
1984 // This is the heart of matching human-rater perception.
1985 // reliable_percent3[] is currently unused
1987 // Do not return Tier3 second language unless there are at least 128 bytes
1988 void CalcSummaryLang(ToteWithReliability* doc_tote, int total_text_bytes,
1989 const int* reliable_percent3,
1990 const Language* language3,
1991 const int* percent3,
1992 Language* summary_lang, bool* is_reliable) {
1993 // Vector of active languages; changes if we delete some
1994 int slot_count = 3;
1995 int active_slot[3] = {0, 1, 2};
1997 int ignore_percent = 0;
1998 int return_percent = percent3[0]; // Default to top lang
1999 *summary_lang = language3[0];
2000 *is_reliable = true;
2001 if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
2003 // If any of top 3 is IGNORE, remove it and increment ignore_percent
2004 for (int i = 0; i < 3; ++i) {
2005 if (language3[i] == TG_UNKNOWN_LANGUAGE) {
2006 ignore_percent += percent3[i];
2007 // Move the rest up, levaing input vectors unchanged
2008 for (int j=i+1; j < 3; ++j) {
2009 active_slot[j - 1] = active_slot[j];
2011 -- slot_count;
2012 // Logically remove Ignore from percentage-text calculation
2013 // (extra 1 in 101 avoids zdiv, biases slightly small)
2014 return_percent = (percent3[0] * 100) / (101 - ignore_percent);
2015 *summary_lang = language3[active_slot[0]];
2016 if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
2021 // If English and X, where X (not UNK) is big enough,
2022 // assume the English is boilerplate and return X.
2023 // Logically remove English from percentage-text calculation
2024 int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
2025 // Require more bytes of text for Tier3 languages
2026 int minbytesneeded = kGoodSecondT1T2MinBytes;
2027 int plang_second = cld::PackLanguage(language3[active_slot[1]]);
2028 bool is_tier3 = (cld::kIsPackedTop40[plang_second] == 0);
2029 if (is_tier3) {
2030 minbytesneeded = kGoodSecondT3MinBytes;
2033 if ((language3[active_slot[0]] == ENGLISH) &&
2034 (language3[active_slot[1]] != ENGLISH) &&
2035 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
2036 (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
2037 (second_bytes >= minbytesneeded)) {
2038 ignore_percent += percent3[active_slot[0]];
2039 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
2040 *summary_lang = language3[active_slot[1]];
2041 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
2043 // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
2044 // assume the FIGS is boilerplate and return X.
2045 // Logically remove FIGS from percentage-text calculation
2046 } else if (IsFIGS(language3[active_slot[0]]) &&
2047 !IsEFIGS(language3[active_slot[1]]) &&
2048 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
2049 (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
2050 (second_bytes >= minbytesneeded)) {
2051 ignore_percent += percent3[active_slot[0]];
2052 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
2053 *summary_lang = language3[active_slot[1]];
2054 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
2056 // Else we are returning the first language, but want to improve its
2057 // return_percent if the second language should be ignored
2058 } else if ((language3[active_slot[1]] == ENGLISH) &&
2059 (language3[active_slot[0]] != ENGLISH)) {
2060 ignore_percent += percent3[active_slot[1]];
2061 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
2062 } else if (IsFIGS(language3[active_slot[1]]) &&
2063 !IsEFIGS(language3[active_slot[0]])) {
2064 ignore_percent += percent3[active_slot[1]];
2065 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
2068 // If return percent is too small (too many languages), return UNKNOWN
2069 if ((return_percent < kGoodFirstMinPercent)) {
2070 *summary_lang = UNKNOWN_LANGUAGE;
2071 *is_reliable = false;
2074 // If return percent is small, return language but set unreliable.
2075 if ((return_percent < kGoodFirstReliableMinPercent)) {
2076 *is_reliable = false;
2079 // If ignore percent is too large, set unreliable.
2080 if ((ignore_percent > kIgnoreMaxPercent)) {
2081 *is_reliable = false;
2084 // If we removed all the active languages, return UNKNOWN
2085 if (slot_count == 0) {
2086 *summary_lang = UNKNOWN_LANGUAGE;
2087 *is_reliable = false;
2093 // Result vector must be exactly three items
2094 Language CompactLangDetImpl::DetectLanguageSummaryV25(
2095 const CompactLangDet::DetectionTables* tables,
2096 const char* buffer,
2097 int buffer_length,
2098 bool is_plain_text,
2099 const char* tld_hint, // "id" boosts Indonesian
2100 int encoding_hint, // SJS boosts Japanese
2101 Language language_hint, // ITALIAN boosts it
2102 bool allow_extended_lang,
2103 int flags,
2104 Language plus_one,
2105 Language* language3,
2106 int* percent3,
2107 double* normalized_score3,
2108 int* text_bytes,
2109 bool* is_reliable) {
2110 if (!tables) {
2111 static const CompactLangDet::DetectionTables default_cld_tables = {
2112 &kQuadTable_obj,
2113 &compact_lang_det_generated_ctjkvz_b1_obj
2115 tables = &default_cld_tables;
2117 language3[0] = UNKNOWN_LANGUAGE;
2118 language3[1] = UNKNOWN_LANGUAGE;
2119 language3[2] = UNKNOWN_LANGUAGE;
2120 percent3[0] = 100;
2121 percent3[1] = 0;
2122 percent3[2] = 0;
2123 normalized_score3[0] = 0.0;
2124 normalized_score3[1] = 0.0;
2125 normalized_score3[2] = 0.0;
2126 *text_bytes = 0;
2127 *is_reliable = false;
2129 // Document totals
2130 ToteWithReliability doc_tote; // Reliability = 0..100
2132 // Vector of packed per-language boosts (just one filled in from hints)
2133 uint8 lang_hint_boost[EXT_NUM_LANGUAGES + 1];
2134 memset(lang_hint_boost, 0, sizeof(lang_hint_boost));
2136 // Apply hints,if any
2137 if ((tld_hint != NULL) && (tld_hint[0] != '\0')) {
2138 ApplyTLDHint(lang_hint_boost, tld_hint);
2140 if (encoding_hint != UNKNOWN_ENCODING) {
2141 ApplyEncodingHint(lang_hint_boost, encoding_hint);
2143 if (language_hint != UNKNOWN_LANGUAGE) {
2144 ApplyLanguageHint(lang_hint_boost, language_hint);
2148 // Four individual script totals, Latin, Han, other2, other3
2149 int next_other_tote = 2;
2151 // Four totes for up to four different scripts pending at once
2152 Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other
2153 bool tote_seen[4] = {false, false, false, false};
2154 int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk
2155 UnicodeLScript tote_script[4] =
2156 {ULScript_Latin, ULScript_HanCJK, ULScript_Common, ULScript_Common};
2158 // Loop through text spans in a single script
2159 ScriptScanner ss(buffer, buffer_length, is_plain_text);
2160 getone::LangSpan scriptspan;
2162 scriptspan.text = NULL;
2163 scriptspan.text_bytes = 0;
2164 scriptspan.offset = 0;
2165 scriptspan.script = ULScript_Common;
2166 scriptspan.lang = UNKNOWN_LANGUAGE;
2168 int total_text_bytes = 0;
2169 int textlimit = FLAGS_cld_textlimit << 10; // in KB
2170 if (textlimit == 0) {textlimit = 0x7fffffff;}
2172 int advance_by = 2; // Advance 2 bytes
2173 int advance_limit = textlimit >> 3; // For first 1/8 of max document
2175 int initial_word_span = kDefaultWordSpan;
2176 if (FLAGS_cld_forcewords) {
2177 initial_word_span = kReallyBigWordSpan;
2180 // Pick up chunk sizes
2181 // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
2182 // Sanity check -- force into a reasonable range
2183 int chunksizequads = FLAGS_cld_smoothwidth;
2184 chunksizequads = cld::minint(cld::maxint(chunksizequads, kMinChunkSizeQuads),
2185 kMaxChunkSizeQuads);
2186 int chunksizeunis = (chunksizequads * 5) >> 1;
2188 // Varying short-span limit doesn't work well -- skips too much beyond 20KB
2189 // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
2190 int spantooshortlimit = kShortSpanThresh;
2192 // For debugging only. Not thread-safe
2193 prior_lang = UNKNOWN_LANGUAGE;
2194 prior_unreliable = false;
2196 // Allocate full-document prediction table for finding repeating words
2197 int hash = 0;
2198 int* predict_tbl = new int[kPredictionTableSize];
2199 if (FlagRepeats(flags)) {
2200 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
2203 // Loop through scriptspans accumulating number of text bytes in each language
2204 while (ss.GetOneScriptSpanLower(&scriptspan)) {
2205 UnicodeLScript lscript = scriptspan.script;
2207 // Echo text if asked to
2208 if (FLAGS_cld_echotext) {
2209 PrintHtmlEscapedText(stderr, scriptspan.text, scriptspan.text_bytes);
2212 // Squeeze out big chunks of text span if asked to
2213 if (FlagSqueeze(flags)) {
2214 // Remove repetitive or mostly-spaces chunks
2215 int newlen;
2216 int chunksize = 0; // Use the default
2217 newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
2218 chunksize);
2219 scriptspan.text_bytes = newlen;
2220 } else {
2221 // Check now and then to see if we should be squeezing
2222 if ((total_text_bytes >= kCheapSqueezeTestThresh) &&
2223 !FlagFinish(flags) &&
2224 ((getone::kMaxScriptBuffer >> 1) < scriptspan.text_bytes) &&
2225 CheapSqueezeTriggerTest(scriptspan.text,
2226 scriptspan.text_bytes,
2227 kCheapSqueezeTestLen)) {
2228 // Recursive call with big-chunk squeezing set
2229 if (FLAGS_cld_html || FLAGS_dbgscore) {
2230 fprintf(stderr,
2231 "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
2232 total_text_bytes);
2234 // Deallocate full-document prediction table
2235 delete[] predict_tbl;
2237 return DetectLanguageSummaryV25(
2238 tables,
2239 buffer,
2240 buffer_length,
2241 is_plain_text,
2242 tld_hint, // "id" boosts Indonesian
2243 encoding_hint, // SJS boosts Japanese
2244 language_hint, // ITALIAN boosts it
2245 allow_extended_lang,
2246 flags | kCLDFlagSqueeze,
2247 plus_one,
2248 language3,
2249 percent3,
2250 normalized_score3,
2251 text_bytes,
2252 is_reliable);
2256 // Remove repetitive words if asked to
2257 if (FlagRepeats(flags)) {
2258 // Remove repetitive words
2259 int newlen;
2260 newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
2261 &hash, predict_tbl);
2262 scriptspan.text_bytes = newlen;
2265 // The real scoring
2266 // Accumulate directly into the document total, or accumulate in one of four
2267 // chunk totals. The purpose of the multiple chunk totals is to piece
2268 // together short choppy pieces of text in alternating scripts. One total is
2269 // dedicated to Latin text, one to Han text, and the other two are dynamically
2270 // assigned.
2271 Language onlylang = cld::kOnlyLanguagePerLScript[lscript];
2273 if (onlylang != UNKNOWN_LANGUAGE) {
2274 // This entire script run is in a single language.
2275 ScoreNilgrams(&scriptspan, cld::PackLanguage(onlylang), &doc_tote,
2276 lang_hint_boost, flags, plus_one);
2277 } else if (cld::kScoreUniPerLScript[lscript] != 0) {
2278 // This entire script run's languages can be distinguished by uni-grams
2279 // Accumulate in hani_tote
2280 int tote_num = 1;
2281 if (!tote_seen[tote_num]) {
2282 tote_seen[tote_num] = true;
2283 // Default language gets 1 byte
2284 total_text_bytes += 1;
2285 InitScriptToteLang(&totes[tote_num], lscript);
2287 ScoreUnigrams(tables->unigram_obj,
2288 &scriptspan, &tote_grams[tote_num], chunksizeunis,
2289 &totes[tote_num],
2290 &doc_tote, lang_hint_boost,
2291 advance_by, flags, &initial_word_span, plus_one);
2292 } else {
2293 // This entire script-run's languages can be distinguished by quad-grams
2294 // Accumulate in latn_tote or script0/1_tote
2295 int tote_num = -1;
2296 for (int t = 0; t < 4; ++t) {
2297 if (lscript == tote_script[t]) {
2298 tote_num = t;
2299 break;
2302 if (tote_num < 0) {
2303 // Need to allocate other0/1
2304 tote_num = next_other_tote;
2305 next_other_tote ^= 1; // Round-robin
2306 if (tote_seen[tote_num]) {
2307 // Flush previous
2308 ScoreChunkIntoDoc2(kToteSwitch[tote_num], advance_by,
2309 tote_script[tote_num], &totes[tote_num],
2310 &doc_tote, tote_grams[tote_num], lang_hint_boost);
2311 totes[tote_num].Reinit();
2313 tote_script[tote_num] = lscript;
2316 if (!tote_seen[tote_num]) {
2317 tote_seen[tote_num] = true;
2318 // Default language gets 1 byte
2319 total_text_bytes += 1;
2320 InitScriptToteLang(&totes[tote_num], lscript);
2323 // The actual accumulation, possibly with word scoring also
2324 ScoreQuadgrams(tables->quadgram_obj, &scriptspan, &tote_grams[tote_num],
2325 chunksizequads,
2326 &totes[tote_num],
2327 &doc_tote, lang_hint_boost,
2328 advance_by, flags, &initial_word_span, plus_one);
2331 total_text_bytes += scriptspan.text_bytes;
2333 // For long documents, do less-dense samples the further along we go.
2334 // This is to keep speed sublinear in document size.
2335 if (total_text_bytes > advance_limit) {
2336 if (total_text_bytes > textlimit) {
2337 // Don't look at rest of doc
2338 if (FLAGS_cld_html || FLAGS_dbgscore) {
2339 fprintf(stderr, "<br>---text_bytes[%d] textlimit %d reached---<br>",
2340 total_text_bytes, textlimit);
2342 break;
2344 advance_by <<= 1; // Double advance bytes
2345 advance_limit <<= 1; // Double limit until next change
2346 spantooshortlimit <<= 1; // Double short-span size
2347 if (FLAGS_cld_html || FLAGS_dbgscore) {
2348 fprintf(stderr, "<br>---text_bytes[%d] advance_by doubled to %d---<br>",
2349 total_text_bytes, advance_by);
2352 } // End while (ss.GetOneScriptSpanLower())
2354 // Deallocate full-document prediction table
2355 delete[] predict_tbl;
2357 // Flush pending totals
2358 for (int tote_num = 0; tote_num < 4; ++tote_num) {
2359 if (tote_seen[tote_num]) {
2360 ScoreChunkIntoDoc2(kToteName[tote_num], advance_by,
2361 tote_script[tote_num], &totes[tote_num], &doc_tote,
2362 tote_grams[tote_num], lang_hint_boost);
2366 // If extended languages are disallowed, remove them here
2367 if (!allow_extended_lang) {
2368 RemoveExtendedLanguages(&doc_tote);
2371 // Force close pairs to one or the other
2372 RefineScoredClosePairs(&doc_tote);
2375 // Calculate return results
2376 // Find top three byte counts in tote heap
2377 int reliable_percent3[3];
2380 // Cannot use Add, etc. after sorting
2381 doc_tote.Sort(3);
2383 ExtractLangEtc(&doc_tote, total_text_bytes,
2384 reliable_percent3, language3, percent3, normalized_score3,
2385 text_bytes, is_reliable);
2387 bool have_good_answer = false;
2388 if (FlagFinish(flags)) {
2389 // Force a result
2390 have_good_answer = true;
2391 } else if (total_text_bytes <= kShortTextThresh) {
2392 // Don't recurse on short text -- we already did word scores
2393 have_good_answer = true;
2394 } else if (*is_reliable &&
2395 (percent3[0] >= kGoodLang1Percent)) {
2396 have_good_answer = true;
2397 } else if (*is_reliable &&
2398 ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
2399 have_good_answer = true;
2403 if (have_good_answer) {
2404 // This is the real, non-recursive return
2406 // Move bytes for unreliable langs to another lang or UNKNOWN
2407 RemoveUnreliableLanguages(&doc_tote);
2409 // Redo the result extraction after the removal above
2410 doc_tote.Sort(3);
2411 ExtractLangEtc(&doc_tote, total_text_bytes,
2412 reliable_percent3, language3, percent3, normalized_score3,
2413 text_bytes, is_reliable);
2415 #if 0
2416 // OLD code, replaced by CalcSummaryLang
2418 // Suppress ignore-me text, TG_UNKNOWN_LANGUAGE if 2nd or 3rd language
2419 // Force it to English if first language
2420 if (language3[2] == TG_UNKNOWN_LANGUAGE) {
2421 reliable_percent3[2] = 0;
2422 language3[2] = UNKNOWN_LANGUAGE;
2423 percent3[2] = 0;
2424 } else if (language3[1] == TG_UNKNOWN_LANGUAGE) {
2425 // Move up lower language
2426 reliable_percent3[1] = reliable_percent3[2];
2427 language3[1] = language3[2];
2428 percent3[1] = percent3[2];
2429 reliable_percent3[2] = 0;
2430 language3[2] = UNKNOWN_LANGUAGE;
2431 percent3[2] = 0;
2432 } else if (language3[0] == TG_UNKNOWN_LANGUAGE) {
2433 language3[0] = ENGLISH;
2436 if (language3[0] == UNKNOWN_LANGUAGE) {
2437 // Last-ditch test for some result, but it is UNKNOWN_LANGUAGE
2438 // Force it to English (should not happen)
2439 language3[0] = ENGLISH;
2440 percent3[0] = 100;
2441 *is_reliable = true;
2443 #endif
2446 #if 0
2447 // Scaffolding to reveal subset sequence lang distribution across doc text
2448 // Track the sequence of language fragments [result currently unused]
2449 if (FLAGS_cld_html) {
2450 static const int kMaxSubsetSeq = 12;
2451 uint8 subseq[kMaxSubsetSeq];
2452 doc_tote.ExtractSeq(kMaxSubsetSeq, subseq);
2454 fprintf(stderr, "<br>\nSubset Sequence[%d]: ", kMaxSubsetSeq);
2455 for (int i = 0; i < kMaxSubsetSeq; ++i) {
2456 fprintf(stderr, "%s ", ExtLanguageCode(cld::UnpackLanguage(subseq[i])));
2457 if ((i % 4) == 3) {fprintf(stderr, "&nbsp; ");}
2459 fprintf(stderr, "&nbsp;&nbsp; ");
2461 for (int i = 0; i < 3; ++i) {
2462 if (language3[i] != UNKNOWN_LANGUAGE) {
2463 fprintf(stderr, "%s.%d(%d%%) ",
2464 ExtLanguageCode(language3[i]),
2465 reliable_percent3[i],
2466 percent3[i]);
2470 fprintf(stderr, "%d B ", total_text_bytes);
2471 fprintf(stderr, "<br>\n");
2473 // End Scaffolding to reveal subset sequence lang distribution
2474 #endif
2476 Language summary_lang;
2477 CalcSummaryLang(&doc_tote, total_text_bytes,
2478 reliable_percent3, language3, percent3,
2479 &summary_lang, is_reliable);
2481 if (FLAGS_cld_html) {
2482 for (int i = 0; i < 3; ++i) {
2483 if (language3[i] != UNKNOWN_LANGUAGE) {
2484 fprintf(stderr, "%s.%d(%d%%) ",
2485 ExtLanguageCode(language3[i]),
2486 reliable_percent3[i],
2487 percent3[i]);
2491 fprintf(stderr, "%d B ", total_text_bytes);
2492 fprintf(stderr, "= %s%c ",
2493 ExtLanguageName(summary_lang), is_reliable ? ' ' : '*');
2494 fprintf(stderr, "<br>\n");
2497 return summary_lang;
2500 // Not a good answer -- do recursive call to refine
2501 if (FLAGS_cld_html || FLAGS_dbgscore) {
2502 // This is what we hope to improve on in the recursive call, if any
2503 PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
2506 // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
2507 // For this purpose, we treat "Ignore" as top40
2508 Language new_plus_one = UNKNOWN_LANGUAGE;
2509 if (cld::kIsPackedTop40[cld::PackLanguage(language3[0])] == 0) {
2510 new_plus_one = language3[0];
2511 } else if (cld::kIsPackedTop40[cld::PackLanguage(language3[1])] == 0) {
2512 new_plus_one = language3[1];
2515 if (total_text_bytes < kShortTextThresh) {
2516 // Short text: Recursive call with top40 and short set
2517 if (FLAGS_cld_html || FLAGS_dbgscore) {
2518 fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "
2519 "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
2520 total_text_bytes);
2522 return DetectLanguageSummaryV25(
2523 tables,
2524 buffer,
2525 buffer_length,
2526 is_plain_text,
2527 tld_hint, // "id" boosts Indonesian
2528 encoding_hint, // SJS boosts Japanese
2529 language_hint, // ITALIAN boosts it
2530 allow_extended_lang,
2531 flags | kCLDFlagTop40 | kCLDFlagRepeats |
2532 kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
2533 new_plus_one,
2534 language3,
2535 percent3,
2536 normalized_score3,
2537 text_bytes,
2538 is_reliable);
2541 // Longer text: Recursive call with top40 set
2542 if (FLAGS_cld_html || FLAGS_dbgscore) {
2543 fprintf(stderr,
2544 "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
2545 total_text_bytes);
2547 return DetectLanguageSummaryV25(
2548 tables,
2549 buffer,
2550 buffer_length,
2551 is_plain_text,
2552 tld_hint, // "id" boosts Indonesian
2553 encoding_hint, // SJS boosts Japanese
2554 language_hint, // ITALIAN boosts it
2555 allow_extended_lang,
2556 flags | kCLDFlagTop40 | kCLDFlagRepeats |
2557 kCLDFlagFinish,
2558 new_plus_one,
2559 language3,
2560 percent3,
2561 normalized_score3,
2562 text_bytes,
2563 is_reliable);
2564 } // End CompactLangDetImpl::DetectLanguageSummaryV25