1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // This file extends lang_enc.cc with additional languages and extended routines
6 // It is current with Unicode 5.1 (beta Jan 2008)
13 #include "encodings/compact_lang_det/ext_lang_enc.h"
14 #include "encodings/compact_lang_det/win/cld_macros.h"
15 #include "encodings/compact_lang_det/win/cld_strtoint.h"
17 // Language names above NUM_LANGUAGES
18 // These are also the C enum declared names
// Indexed by (lang - EXT_LANGUAGE_BASE). ExtLanguageName() and
// ExtLanguageDeclaredName() both return entries of this table for extended
// languages. kExtLanguageCode is indexed with the same offset, so the two
// tables are expected to list the languages in the same order.
19 static const char* const kExtLanguageName
[] = {
20 "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
22 // Pseudo-languages for Unicode scripts that express a single language
23 "X_OGHAM", "X_RUNIC", "X_YI", "X_OLD_ITALIC", "X_GOTHIC",
24 "X_DESERET", "X_HANUNOO", "X_BUHID", "X_TAGBANWA", "X_TAI_LE",
25 "X_LINEAR_B", "X_UGARITIC", "X_SHAVIAN", "X_OSMANYA", "X_CYPRIOT",
26 "X_BUGINESE", "X_COPTIC", "X_NEW_TAI_LUE", "X_GLAGOLITIC", "X_TIFINAGH",
27 "X_SYLOTI_NAGRI", "X_OLD_PERSIAN", "X_KHAROSHTHI", "X_BALINESE", "X_CUNEIFORM",
28 "X_PHOENICIAN", "X_PHAGS_PA", "X_NKO",
// The entries below correspond to the scripts tagged "Unicode 5.1" in
// kNameScriptPair.
31 "X_SUDANESE", "X_LEPCHA", "X_OL_CHIKI", "X_VAI", "X_SAURASHTRA",
32 "X_KAYAH_LI", "X_REJANG", "X_LYCIAN", "X_CARIAN", "X_LYDIAN",
37 // These are the C enum declared names, for programs creating C code
// Indexed directly by the Language value for 0 <= lang < NUM_LANGUAGES
// (see ExtLanguageDeclaredName). The number in the trailing comment of each
// entry is presumably that entry's index / enum value -- keep the two in step
// when editing.
38 static const char* const kExtLangDeclaredName
[] = {
51 "PORTUGUESE", /* 12 */
60 "LITHUANIAN", /* 21 */
64 "TG_UNKNOWN_LANGUAGE", /* 25 */
65 "UNKNOWN_LANGUAGE", /* 26 */
75 "MACEDONIAN", /* 36 */
77 "INDONESIAN", /* 38 */
86 "BELARUSIAN", /* 47 */
97 "INTERLINGUA", /* 58 */
100 "SCOTS_GAELIC", /* 61 */
102 "SLOVENIAN", /* 63 */
105 "VIETNAMESE", /* 66 */
108 "CHINESE_T", /* 69 */
110 "SUNDANESE", /* 71 */
113 "AZERBAIJANI", /* 74 */
118 "SINHALESE", /* 79 */
119 "NORWEGIAN_N", /* 80 */
120 "PORTUGUESE_P", /* 81 */
121 "PORTUGUESE_B", /* 82 */
131 "SERBO_CROATIAN", /* 92 */
135 "MONGOLIAN", /* 96 */
139 "RHAETO_ROMANCE", /* 100 */
140 "AFRIKAANS", /* 101 */
141 "LUXEMBOURGISH", /* 102 */
145 "DHIVEHI", /* 106 */ // sometimes spelled Divehi; lang of Maldives
146 "CHEROKEE", /* 107 */
150 "ASSAMESE", /* 111 */
151 "CORSICAN", /* 112 */
152 "INTERLINGUE", /* 113 */
155 "MOLDAVIAN", /* 116 */
163 "CREOLES_AND_PIDGINS_ENGLISH_BASED", /* 124 */
164 "CREOLES_AND_PIDGINS_FRENCH_BASED", /* 125 */
165 "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", /* 126 */
166 "CREOLES_AND_PIDGINS_OTHER", /* 127 */
169 "ABKHAZIAN", /* 130 */
174 "DZONGKHA", /* 135 */
176 "GREENLANDIC", /* 137 */
178 "HAITIAN_CREOLE", /* 139 */
180 "INUKTITUT", /* 141 */
181 "KASHMIRI", /* 142 */
182 "KINYARWANDA", /* 143 */
183 "MALAGASY", /* 144 */
189 "SANSKRIT", /* 150 */
199 "MONTENEGRIN", /* 160 */
200 // Add new language declared names just before here
// Compile-time check that the table has exactly NUM_LANGUAGES entries, so
// that indexing it with any base Language value stays in bounds.
203 COMPILE_ASSERT(arraysize(kExtLangDeclaredName
) == NUM_LANGUAGES
,
204 kExtLangDeclaredName_has_incorrect_length
);
207 // Language codes above NUM_LANGUAGES
208 // I made all these up, except Klingon from ISO-639-2 (dsites)
209 // NOTE: zza is a standard name
// Indexed by (lang - EXT_LANGUAGE_BASE) in ExtLanguageCode(). Entries are
// listed in the same order as kExtLanguageName. The script pseudo-languages
// use the form "xx-" followed by a four-letter script code; the same strings
// are matched in GetLanguageFromNumberOrName().
210 static const char* const kExtLanguageCode
[] = {
211 // "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
213 "zzb", "zzp", "zzh", "tlh", "zze",
215 // Pseudo-languages for Unicode scripts that express a single language
216 "xx-Ogam", "xx-Runr", "xx-Yiii", "xx-Ital", "xx-Goth",
217 "xx-Dsrt", "xx-Hano", "xx-Buhd", "xx-Tagb", "xx-Tale",
218 "xx-Linb", "xx-Ugar", "xx-Shaw", "xx-Osma", "xx-Cprt",
219 "xx-Bugi", "xx-Copt", "xx-Talu", "xx-Glag", "xx-Tfng",
220 "xx-Sylo", "xx-Xpeo", "xx-Khar", "xx-Bali", "xx-Xsux",
221 "xx-Phnx", "xx-Phag", "xx-Nkoo",
// The codes below correspond to the scripts tagged "Unicode 5.1" in
// kNameScriptPair.
224 "xx-Sund", "xx-Lepc", "xx-Olck", "xx-Vaii", "xx-Saur",
225 "xx-Kali", "xx-Rjng", "xx-Lyci", "xx-Cari", "xx-Lydi",
230 // Given the Language, returns its string name used as the output by
231 // the lang/enc identifier, e.g. "Korean"
232 // "invalid_language" if the input is invalid.
233 // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
234 // used to subtract out HTML, link farms, DNA strings, and a little English porn
//
// Lookup order: TG_UNKNOWN_LANGUAGE is special-cased first, then base
// languages are delegated to LanguageName(), then extended languages are read
// from kExtLanguageName; anything else gets invalid_language_name().
235 const char* ExtLanguageName(const Language lang
) {
237 // No-text-at-all result from a Tote
240 // CompactLanguageDetect extension
241 if (lang
== TG_UNKNOWN_LANGUAGE
) {
// Base languages: defer to the non-extended name lookup.
244 if ((0 <= lang
) && (lang
< NUM_LANGUAGES
)) {
245 return LanguageName(lang
);
// Extended languages: table is offset by EXT_LANGUAGE_BASE.
247 if ((EXT_LANGUAGE_BASE
<= lang
) && (lang
< EXT_NUM_LANGUAGES
)) {
248 return kExtLanguageName
[lang
- EXT_LANGUAGE_BASE
];
// Value is outside both the base and the extended range.
250 return invalid_language_name();
254 // Given the Language, returns its Language enum spelling, for use by
255 // programs that create C declarations, e.g. "KOREAN"
256 // "UNKNOWN_LANGUAGE" if the input is invalid.
257 const char* ExtLanguageDeclaredName(const Language lang
) {
258 if ((0 <= lang
) && (lang
< NUM_LANGUAGES
)) {
259 return kExtLangDeclaredName
[lang
];
261 if ((EXT_LANGUAGE_BASE
<= lang
) && (lang
< EXT_NUM_LANGUAGES
)) {
262 return kExtLanguageName
[lang
- EXT_LANGUAGE_BASE
];
264 return "UNKNOWN_LANGUAGE";
267 // Given the Language, return the language code, e.g. "ko"
//
// Lookup order: TG_UNKNOWN_LANGUAGE is special-cased first, then base
// languages are delegated to LanguageCode(), then extended languages are read
// from kExtLanguageCode.
268 const char* ExtLanguageCode(const Language lang
) {
269 // Hack for ignore/porn pseudo-language
270 if (lang
== TG_UNKNOWN_LANGUAGE
) {
// Base languages: defer to the non-extended code lookup.
273 if ((0 <= lang
) && (lang
< NUM_LANGUAGES
)) {
274 return LanguageCode(lang
);
// Extended languages: table is offset by EXT_LANGUAGE_BASE.
276 if ((EXT_LANGUAGE_BASE
<= lang
) && (lang
< EXT_NUM_LANGUAGES
)) {
277 return kExtLanguageCode
[lang
- EXT_LANGUAGE_BASE
];
283 // Convert "en-Latn-GB" to ENGLISH
284 // Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
285 // Consider for later: NORWEGIAN, NORWEGIAN_N
286 // Consider for later: SCOTS, SCOTS_GAELIC
287 // Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
//
// A src made up only of decimal digits is parsed with strto32() and cast
// straight to Language; no range check is applied to the result here.
// Otherwise src is matched against a series of hard-coded prefixes and exact
// tags before falling back to LanguageFromCode().
289 Language
GetLanguageFromNumberOrName(const char* src
) {
290 if (strspn(src
, "0123456789") == strlen(src
)) {
292 return static_cast<Language
>(strto32(src
, NULL
, 10));
295 Language retlang
= UNKNOWN_LANGUAGE
;
296 size_t len
= strlen(src
);
298 if (true /*FLAGS_mergepairs*/) {
299 // Merge sets of languages pt-xx en-xx fr-xx, NOT bs/hr/sr
// NOTE(review): the memcmp() calls below always compare a fixed 3 or 7 bytes,
// even when src is shorter than that -- confirm that reading up to and past
// the terminator of a short src is acceptable on all callers' inputs.
300 if (memcmp(src
, "pt-", 3) == 0) {return PORTUGUESE
;}
301 if (memcmp(src
, "en-", 3) == 0) {return ENGLISH
;}
302 if (memcmp(src
, "fr-", 3) == 0) {return FRENCH
;}
303 // Use NormalizeLanguage instead
// Latin-script Serbian/Serbo-Croatian tags map to CROATIAN, Cyrillic ones to
// SERBIAN.
304 if (memcmp(src
, "bs-", 3) == 0) {return CROATIAN
;}
305 if (memcmp(src
, "hr-", 3) == 0) {return CROATIAN
;}
306 if (memcmp(src
, "sr-Latn", 7) == 0) {return CROATIAN
;}
307 if (memcmp(src
, "sh-Latn", 7) == 0) {return CROATIAN
;}
308 if (memcmp(src
, "sr-Cyrl", 7) == 0) {return SERBIAN
;}
309 if (memcmp(src
, "sh-Cyrl", 7) == 0) {return SERBIAN
;}
314 // Standin for ignore/porn "language"
315 if (memcmp(src
, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE
;}
// Made-up three-letter codes; these match the first five entries of
// kExtLanguageCode.
317 if (memcmp(src
, "zzb", 3) == 0) {return X_BORK_BORK_BORK
;}
318 if (memcmp(src
, "zzp", 3) == 0) {return X_PIG_LATIN
;}
319 if (memcmp(src
, "zzh", 3) == 0) {return X_HACKER
;}
320 if (memcmp(src
, "tlh", 3) == 0) {return X_KLINGON
;}
321 if (memcmp(src
, "zze", 3) == 0) {return X_ELMER_FUDD
;}
324 // We have a name like en-Latn-GB or pt-BR
325 // First, get rid of some special cases
327 LanguageFromCode(src
, &retlang
);
328 } else if (len
== 7) {
// Seven-character "xx-Ssss" tags: script pseudo-languages, one per entry of
// kExtLanguageCode.
330 if (memcmp(src
, "xx-", 3) == 0) {
331 if (memcmp(src
, "xx-Ogam", 7) == 0) {return X_OGHAM
;}
332 if (memcmp(src
, "xx-Runr", 7) == 0) {return X_RUNIC
;}
333 if (memcmp(src
, "xx-Yiii", 7) == 0) {return X_YI
;}
334 if (memcmp(src
, "xx-Ital", 7) == 0) {return X_OLD_ITALIC
;}
335 if (memcmp(src
, "xx-Goth", 7) == 0) {return X_GOTHIC
;}
336 if (memcmp(src
, "xx-Dsrt", 7) == 0) {return X_DESERET
;}
337 if (memcmp(src
, "xx-Hano", 7) == 0) {return X_HANUNOO
;}
338 if (memcmp(src
, "xx-Buhd", 7) == 0) {return X_BUHID
;}
339 if (memcmp(src
, "xx-Tagb", 7) == 0) {return X_TAGBANWA
;}
340 if (memcmp(src
, "xx-Tale", 7) == 0) {return X_TAI_LE
;}
341 if (memcmp(src
, "xx-Linb", 7) == 0) {return X_LINEAR_B
;}
342 if (memcmp(src
, "xx-Ugar", 7) == 0) {return X_UGARITIC
;}
343 if (memcmp(src
, "xx-Shaw", 7) == 0) {return X_SHAVIAN
;}
344 if (memcmp(src
, "xx-Osma", 7) == 0) {return X_OSMANYA
;}
345 if (memcmp(src
, "xx-Cprt", 7) == 0) {return X_CYPRIOT
;}
346 if (memcmp(src
, "xx-Bugi", 7) == 0) {return X_BUGINESE
;}
347 if (memcmp(src
, "xx-Copt", 7) == 0) {return X_COPTIC
;}
348 if (memcmp(src
, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE
;}
349 if (memcmp(src
, "xx-Glag", 7) == 0) {return X_GLAGOLITIC
;}
350 if (memcmp(src
, "xx-Tfng", 7) == 0) {return X_TIFINAGH
;}
351 if (memcmp(src
, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI
;}
352 if (memcmp(src
, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN
;}
353 if (memcmp(src
, "xx-Khar", 7) == 0) {return X_KHAROSHTHI
;}
354 if (memcmp(src
, "xx-Bali", 7) == 0) {return X_BALINESE
;}
355 if (memcmp(src
, "xx-Xsux", 7) == 0) {return X_CUNEIFORM
;}
356 if (memcmp(src
, "xx-Phnx", 7) == 0) {return X_PHOENICIAN
;}
357 if (memcmp(src
, "xx-Phag", 7) == 0) {return X_PHAGS_PA
;}
358 if (memcmp(src
, "xx-Nkoo", 7) == 0) {return X_NKO
;}
// Note: "Sund" is paired with ULScript_Sundanese in kNameScriptPair, while the
// language enum returned here is spelled X_SUDANESE.
361 if (memcmp(src
, "xx-Sund", 7) == 0) {return X_SUDANESE
;}
362 if (memcmp(src
, "xx-Lepc", 7) == 0) {return X_LEPCHA
;}
363 if (memcmp(src
, "xx-Olck", 7) == 0) {return X_OL_CHIKI
;}
364 if (memcmp(src
, "xx-Vaii", 7) == 0) {return X_VAI
;}
365 if (memcmp(src
, "xx-Saur", 7) == 0) {return X_SAURASHTRA
;}
366 if (memcmp(src
, "xx-Kali", 7) == 0) {return X_KAYAH_LI
;}
367 if (memcmp(src
, "xx-Rjng", 7) == 0) {return X_REJANG
;}
368 if (memcmp(src
, "xx-Lyci", 7) == 0) {return X_LYCIAN
;}
369 if (memcmp(src
, "xx-Cari", 7) == 0) {return X_CARIAN
;}
370 if (memcmp(src
, "xx-Lydi", 7) == 0) {return X_LYDIAN
;}
371 if (memcmp(src
, "xx-Cham", 7) == 0) {return X_CHAM
;}
374 // Some other weird ones
375 // Could be Latn or Limb; all our current training data is Latn
376 if (strcmp(src
, "sit-NP") == 0) {return LIMBU
;}
377 if (strcmp(src
, "un-Latn") == 0) {return UNKNOWN_LANGUAGE
;}
379 // Multi-country languages
// For these, the language prefix is checked first and then the last two
// characters of src are compared against a region code.
380 if (memcmp(src
, "zh", 2) == 0) {
381 if (memcmp(&src
[len
- 2], "TW", 2) == 0) {return CHINESE_T
;}
382 if (memcmp(&src
[len
- 2], "HK", 2) == 0) {return CHINESE_T
;}
385 if (memcmp(src
, "pt", 2) == 0) {
386 if (memcmp(&src
[len
- 2], "BR", 2) == 0) {return PORTUGUESE
;}
389 if (memcmp(src
, "fr", 2) == 0) {
390 if (memcmp(&src
[len
-2], "CA", 2) == 0) {return FRENCH
;}
394 // None of the special cases matched
// NOTE(review): both copies below take a fixed 4 bytes from src -- confirm
// that temp is large enough and that src has at least 4 readable bytes on
// each of these paths.
397 memcpy(temp
, src
, 4);
399 LanguageFromCode(temp
, &retlang
);
403 memcpy(temp
, src
, 4);
405 LanguageFromCode(temp
, &retlang
);
407 if (retlang
!= UNKNOWN_LANGUAGE
) {
416 UnicodeLScript lscript
;
419 // In alphabetic order for binary search
// Maps a four-letter script code to its UnicodeLScript value.
// GetLScriptFromNumberOrName() binary-searches this table with strcmp() on
// the name field, so entries must stay sorted in strcmp (byte-wise) order;
// insert new scripts at their sorted position rather than appending.
420 static const NameScriptPair kNameScriptPair
[] = {
421 // Unicode 5.1 additional scripts
422 {"Arab", ULScript_Arabic
},
423 {"Armn", ULScript_Armenian
},
424 {"Bali", ULScript_Balinese
},
425 {"Beng", ULScript_Bengali
},
426 {"Bugi", ULScript_Buginese
},
427 {"Buhd", ULScript_Buhid
},
428 {"Cans", ULScript_Canadian_Aboriginal
},
429 {"Cari", ULScript_Carian
}, // Unicode 5.1
430 {"Cham", ULScript_Cham
}, // Unicode 5.1
431 {"Cher", ULScript_Cherokee
},
432 {"Copt", ULScript_Coptic
},
433 {"Cprt", ULScript_Cypriot
},
434 {"Cyrl", ULScript_Cyrillic
},
435 {"Deva", ULScript_Devanagari
},
436 {"Dsrt", ULScript_Deseret
},
437 {"Ethi", ULScript_Ethiopic
},
438 {"Geor", ULScript_Georgian
},
439 {"Glag", ULScript_Glagolitic
},
440 {"Goth", ULScript_Gothic
},
441 {"Grek", ULScript_Greek
},
442 {"Gujr", ULScript_Gujarati
},
443 {"Guru", ULScript_Gurmukhi
},
444 {"Hani", ULScript_HanCJK
},
445 {"Hano", ULScript_Hanunoo
},
446 {"Hebr", ULScript_Hebrew
},
447 {"Ital", ULScript_Old_Italic
},
448 {"Kali", ULScript_Kayah_Li
}, // Unicode 5.1
449 {"Khar", ULScript_Kharoshthi
},
450 {"Khmr", ULScript_Khmer
},
451 {"Knda", ULScript_Kannada
},
452 {"Laoo", ULScript_Lao
},
453 {"Latn", ULScript_Latin
},
454 {"Lepc", ULScript_Lepcha
}, // Unicode 5.1
455 {"Limb", ULScript_Limbu
},
456 {"Linb", ULScript_Linear_B
},
457 {"Lyci", ULScript_Lycian
}, // Unicode 5.1
458 {"Lydi", ULScript_Lydian
}, // Unicode 5.1
459 {"Mlym", ULScript_Malayalam
},
460 {"Mong", ULScript_Mongolian
},
461 {"Mymr", ULScript_Myanmar
},
462 {"Nkoo", ULScript_Nko
},
463 {"Ogam", ULScript_Ogham
},
464 {"Olck", ULScript_Ol_Chiki
}, // Unicode 5.1
465 {"Orya", ULScript_Oriya
},
466 {"Osma", ULScript_Osmanya
},
467 {"Phag", ULScript_Phags_Pa
},
468 {"Phnx", ULScript_Phoenician
},
469 {"Rjng", ULScript_Rejang
}, // Unicode 5.1
470 {"Runr", ULScript_Runic
},
471 {"Saur", ULScript_Saurashtra
}, // Unicode 5.1
472 {"Shaw", ULScript_Shavian
},
473 {"Sinh", ULScript_Sinhala
},
474 {"Sund", ULScript_Sundanese
}, // Unicode 5.1
475 {"Sylo", ULScript_Syloti_Nagri
},
476 {"Syrc", ULScript_Syriac
},
477 {"Tagb", ULScript_Tagbanwa
},
478 {"Tale", ULScript_Tai_Le
},
479 {"Talu", ULScript_New_Tai_Lue
},
480 {"Taml", ULScript_Tamil
},
481 {"Telu", ULScript_Telugu
},
482 {"Tfng", ULScript_Tifinagh
},
483 {"Tglg", ULScript_Tagalog
},
484 {"Thaa", ULScript_Thaana
},
485 {"Thai", ULScript_Thai
},
486 {"Tibt", ULScript_Tibetan
},
487 {"Ugar", ULScript_Ugaritic
},
488 {"Vaii", ULScript_Vai
}, // Unicode 5.1 // NOTE: apparently 'Vai '
489 {"Xpeo", ULScript_Old_Persian
},
490 {"Xsux", ULScript_Cuneiform
},
491 {"Yiii", ULScript_Yi
},
492 {"Zyyy", ULScript_Common
},
493 {"Zzzz", ULScript_Inherited
},
496 // Convert "en-Latn-GB" to ULScript_Latin
497 UnicodeLScript
GetLScriptFromNumberOrName(const char* src
) {
498 if (strspn(src
, "0123456789") == strlen(src
)) {
500 return static_cast<UnicodeLScript
>(strto32(src
, NULL
, 10));
503 if (strcmp(src
, "zh-TW") == 0) {return ULScript_HanCJK
;}
504 if (strcmp(src
, "zh-CN") == 0) {return ULScript_HanCJK
;}
505 if (strcmp(src
, "pt-BR") == 0) {return ULScript_Latin
;}
506 if (strcmp(src
, "pt-PT") == 0) {return ULScript_Latin
;}
507 // Could be Latn or Limb; all our current training data is Latn
508 if (strcmp(src
, "sit-NP") == 0) {return ULScript_Latin
;}
510 // Isolate just the script field
512 const char* src2
= strchr(src
, '-');
513 if (src2
== NULL
) {return ULScript_Latin
;}
514 src2
+= 1; // over the -
515 memcpy(temp
, src2
, 4);
519 int hi
= ULScript_NUM_SCRIPTS
;
521 int mid
= (lo
+ hi
) >> 1;
522 if (strcmp(temp
, kNameScriptPair
[mid
].name
) < 0) {
524 } else if (strcmp(temp
, kNameScriptPair
[mid
].name
) > 0) {
527 return kNameScriptPair
[mid
].lscript
;
530 return ULScript_Latin
;
534 // Merge together some languages, such as bs/hr/sr
535 // Croatian Latin and Serbian Cyrillic now.
536 Language
NormalizeLanguage(Language lang
) {
537 if (lang
== BOSNIAN
) {return CROATIAN
;}
538 if (lang
== SERBO_CROATIAN
) {return SERBIAN
;}
540 if (lang
== PORTUGUESE_P
) {return PORTUGUESE
;}
541 if (lang
== PORTUGUESE_B
) {return PORTUGUESE
;}