1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "languages/public/languages.h"
7 #include "base/string_util.h"
8 #include "encodings/compact_lang_det/win/cld_basictypes.h"
11 Language
default_language() {return ENGLISH
;}
// Language names and codes.
//
// One row of kLanguageInfoTable: a language name followed by up to three
// codes. A code field is NULL when the row carries no code of that kind.
struct LanguageInfo {
  const char* language_name_;
  const char* language_code_639_1_;  // the ISO-639-1 code for the language
  const char* language_code_639_2_;  // the ISO-639-2 code for the language
  const char* language_code_other_;  // some nonstandard code for the language
};
23 static const LanguageInfo kLanguageInfoTable
[] = {
24 { "ENGLISH", "en", "eng", NULL
},
25 { "DANISH", "da", "dan", NULL
},
26 { "DUTCH", "nl", "dut", NULL
},
27 { "FINNISH", "fi", "fin", NULL
},
28 { "FRENCH", "fr", "fre", NULL
},
29 { "GERMAN", "de", "ger", NULL
},
30 { "HEBREW", "he", "heb", NULL
},
31 { "ITALIAN", "it", "ita", NULL
},
32 { "Japanese", "ja", "jpn", NULL
},
33 { "Korean", "ko", "kor", NULL
},
34 { "NORWEGIAN", "nb", "nor", NULL
},
35 { "POLISH", "pl", "pol", NULL
},
36 { "PORTUGUESE", "pt", "por", NULL
},
37 { "RUSSIAN", "ru", "rus", NULL
},
38 { "SPANISH", "es", "spa", NULL
},
39 { "SWEDISH", "sv", "swe", NULL
},
40 { "Chinese", "zh", "chi", "zh-CN"},
41 { "CZECH", "cs", "cze", NULL
},
42 { "GREEK", "el", "gre", NULL
},
43 { "ICELANDIC", "is", "ice", NULL
},
44 { "LATVIAN", "lv", "lav", NULL
},
45 { "LITHUANIAN", "lt", "lit", NULL
},
46 { "ROMANIAN", "ro", "rum", NULL
},
47 { "HUNGARIAN", "hu", "hun", NULL
},
48 { "ESTONIAN", "et", "est", NULL
},
49 // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
50 // and "Unknown", they are essentially the same. Need to unify them.
51 // "un" and "ut" are invented by us, not from ISO-639.
53 { "TG_UNKNOWN_LANGUAGE", NULL
, NULL
, "ut"},
54 { "Unknown", NULL
, NULL
, "un"},
55 { "BULGARIAN", "bg", "bul", NULL
},
56 { "CROATIAN", "hr", "scr", NULL
},
57 { "SERBIAN", "sr", "scc", NULL
},
58 { "IRISH", "ga", "gle", NULL
},
59 { "GALICIAN", "gl", "glg", NULL
},
60 // Impossible to tell Tagalog from Filipino at the moment.
61 // Use ISO 639-2 code for Filipino here.
62 { "TAGALOG", NULL
, "fil", NULL
},
63 { "TURKISH", "tr", "tur", NULL
},
64 { "UKRAINIAN", "uk", "ukr", NULL
},
65 { "HINDI", "hi", "hin", NULL
},
66 { "MACEDONIAN", "mk", "mac", NULL
},
67 { "BENGALI", "bn", "ben", NULL
},
68 { "INDONESIAN", "id", "ind", NULL
},
69 { "LATIN", "la", "lat", NULL
},
70 { "MALAY", "ms", "may", NULL
},
71 { "MALAYALAM", "ml", "mal", NULL
},
72 { "WELSH", "cy", "wel", NULL
},
73 { "NEPALI", "ne", "nep", NULL
},
74 { "TELUGU", "te", "tel", NULL
},
75 { "ALBANIAN", "sq", "alb", NULL
},
76 { "TAMIL", "ta", "tam", NULL
},
77 { "BELARUSIAN", "be", "bel", NULL
},
78 { "JAVANESE", "jw", "jav", NULL
},
79 { "OCCITAN", "oc", "oci", NULL
},
80 { "URDU", "ur", "urd", NULL
},
81 { "BIHARI", "bh", "bih", NULL
},
82 { "GUJARATI", "gu", "guj", NULL
},
83 { "THAI", "th", "tha", NULL
},
84 { "ARABIC", "ar", "ara", NULL
},
85 { "CATALAN", "ca", "cat", NULL
},
86 { "ESPERANTO", "eo", "epo", NULL
},
87 { "BASQUE", "eu", "baq", NULL
},
88 { "INTERLINGUA", "ia", "ina", NULL
},
89 { "KANNADA", "kn", "kan", NULL
},
90 { "PUNJABI", "pa", "pan", NULL
},
91 { "SCOTS_GAELIC", "gd", "gla", NULL
},
92 { "SWAHILI", "sw", "swa", NULL
},
93 { "SLOVENIAN", "sl", "slv", NULL
},
94 { "MARATHI", "mr", "mar", NULL
},
95 { "MALTESE", "mt", "mlt", NULL
},
96 { "VIETNAMESE", "vi", "vie", NULL
},
97 { "FRISIAN", "fy", "fry", NULL
},
98 { "SLOVAK", "sk", "slo", NULL
},
100 NULL
, NULL
, // We intentionally set these 2 fields to NULL to avoid
101 // confusion between CHINESE_T and CHINESE.
103 { "FAROESE", "fo", "fao", NULL
},
104 { "SUNDANESE", "su", "sun", NULL
},
105 { "UZBEK", "uz", "uzb", NULL
},
106 { "AMHARIC", "am", "amh", NULL
},
107 { "AZERBAIJANI", "az", "aze", NULL
},
108 { "GEORGIAN", "ka", "geo", NULL
},
109 { "TIGRINYA", "ti", "tir", NULL
},
110 { "PERSIAN", "fa", "per", NULL
},
111 { "BOSNIAN", "bs", "bos", NULL
},
112 { "SINHALESE", "si", "sin", NULL
},
113 { "NORWEGIAN_N", "nn", "nno", NULL
},
114 { "PORTUGUESE_P", NULL
, NULL
, "pt-PT"},
115 { "PORTUGUESE_B", NULL
, NULL
, "pt-BR"},
116 { "XHOSA", "xh", "xho", NULL
},
117 { "ZULU", "zu", "zul", NULL
},
118 { "GUARANI", "gn", "grn", NULL
},
119 { "SESOTHO", "st", "sot", NULL
},
120 { "TURKMEN", "tk", "tuk", NULL
},
121 { "KYRGYZ", "ky", "kir", NULL
},
122 { "BRETON", "br", "bre", NULL
},
123 { "TWI", "tw", "twi", NULL
},
124 { "YIDDISH", "yi", "yid", NULL
},
125 { "SERBO_CROATIAN", "sh", NULL
, NULL
},
126 { "SOMALI", "so", "som", NULL
},
127 { "UIGHUR", "ug", "uig", NULL
},
128 { "KURDISH", "ku", "kur", NULL
},
129 { "MONGOLIAN", "mn", "mon", NULL
},
130 { "ARMENIAN", "hy", "arm", NULL
},
131 { "LAOTHIAN", "lo", "lao", NULL
},
132 { "SINDHI", "sd", "snd", NULL
},
133 { "RHAETO_ROMANCE", "rm", "roh", NULL
},
134 { "AFRIKAANS", "af", "afr", NULL
},
135 { "LUXEMBOURGISH", "lb", "ltz", NULL
},
136 { "BURMESE", "my", "bur", NULL
},
137 // KHMER is known as Cambodian for Google user interfaces.
138 { "KHMER", "km", "khm", NULL
},
139 { "TIBETAN", "bo", "tib", NULL
},
140 { "DHIVEHI", "dv", "div", NULL
},
141 { "CHEROKEE", NULL
, "chr", NULL
},
142 { "SYRIAC", NULL
, "syr", NULL
},
143 { "LIMBU", NULL
, NULL
, "sit-NP"},
144 { "ORIYA", "or", "ori", NULL
},
145 { "ASSAMESE", "as", "asm", NULL
},
146 { "CORSICAN", "co", "cos", NULL
},
147 { "INTERLINGUE", "ie", "ine", NULL
},
148 { "KAZAKH", "kk", "kaz", NULL
},
149 { "LINGALA", "ln", "lin", NULL
},
150 { "MOLDAVIAN", "mo", "mol", NULL
},
151 { "PASHTO", "ps", "pus", NULL
},
152 { "QUECHUA", "qu", "que", NULL
},
153 { "SHONA", "sn", "sna", NULL
},
154 { "TAJIK", "tg", "tgk", NULL
},
155 { "TATAR", "tt", "tat", NULL
},
156 { "TONGA", "to", "tog", NULL
},
157 { "YORUBA", "yo", "yor", NULL
},
158 { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL
, "cpe", NULL
},
159 { "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL
, "cpf", NULL
},
160 { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL
, "cpp", NULL
},
161 { "CREOLES_AND_PIDGINS_OTHER", NULL
, "crp", NULL
},
162 { "MAORI", "mi", "mao", NULL
},
163 { "WOLOF", "wo", "wol", NULL
},
164 { "ABKHAZIAN", "ab", "abk", NULL
},
165 { "AFAR", "aa", "aar", NULL
},
166 { "AYMARA", "ay", "aym", NULL
},
167 { "BASHKIR", "ba", "bak", NULL
},
168 { "BISLAMA", "bi", "bis", NULL
},
169 { "DZONGKHA", "dz", "dzo", NULL
},
170 { "FIJIAN", "fj", "fij", NULL
},
171 { "GREENLANDIC", "kl", "kal", NULL
},
172 { "HAUSA", "ha", "hau", NULL
},
173 { "HAITIAN_CREOLE", "ht", NULL
, NULL
},
174 { "INUPIAK", "ik", "ipk", NULL
},
175 { "INUKTITUT", "iu", "iku", NULL
},
176 { "KASHMIRI", "ks", "kas", NULL
},
177 { "KINYARWANDA", "rw", "kin", NULL
},
178 { "MALAGASY", "mg", "mlg", NULL
},
179 { "NAURU", "na", "nau", NULL
},
180 { "OROMO", "om", "orm", NULL
},
181 { "RUNDI", "rn", "run", NULL
},
182 { "SAMOAN", "sm", "smo", NULL
},
183 { "SANGO", "sg", "sag", NULL
},
184 { "SANSKRIT", "sa", "san", NULL
},
185 { "SISWANT", "ss", "ssw", NULL
},
186 { "TSONGA", "ts", "tso", NULL
},
187 { "TSWANA", "tn", "tsn", NULL
},
188 { "VOLAPUK", "vo", "vol", NULL
},
189 { "ZHUANG", "za", "zha", NULL
},
190 { "KHASI", NULL
, "kha", NULL
},
191 { "SCOTS", NULL
, "sco", NULL
},
192 { "GANDA", "lg", "lug", NULL
},
193 { "MANX", "gv", "glv", NULL
},
194 { "MONTENEGRIN", NULL
, NULL
, "sr-ME"},
195 { "XX", NULL
, NULL
, "XX"},
198 COMPILE_ASSERT(arraysize(kLanguageInfoTable
) == NUM_LANGUAGES
+ 1,
199 kLanguageInfoTable_has_incorrect_length
);
204 const char* default_language_name() {
205 return kLanguageInfoTable
[ENGLISH
].language_name_
;
// Name reported for any Language value that is not valid.
static const char* const kInvalidLanguageName = "invalid_language";

// Returns the placeholder name used for invalid languages. The same pointer
// is returned on every call.
const char* invalid_language_name() {
  return kInvalidLanguageName;
}
214 const char* LanguageName(Language lang
) {
215 return IsValidLanguage(lang
)
216 ? kLanguageInfoTable
[lang
].language_name_
217 : kInvalidLanguageName
;
// The space before invalid_language_code is intentional. It is used
// to prevent it matching any two letter language code.
static const char* const kInvalidLanguageCode = " invalid_language_code";

// Returns the placeholder code used for invalid languages. The same pointer
// is returned on every call.
const char* invalid_language_code() {
  return kInvalidLanguageCode;
}
234 const char * LanguageCode(Language lang
) {
235 if (! IsValidLanguage(lang
))
236 return kInvalidLanguageCode
;
237 const LanguageInfo
& info
= kLanguageInfoTable
[lang
];
238 if (info
.language_code_639_1_
) {
239 return info
.language_code_639_1_
;
240 } else if (info
.language_code_639_2_
) {
241 return info
.language_code_639_2_
;
242 } else if (info
.language_code_other_
) {
243 return info
.language_code_other_
;
245 return kInvalidLanguageCode
;
249 const char* default_language_code() {
250 return kLanguageInfoTable
[ENGLISH
].language_code_639_1_
;
253 const char* LanguageCodeISO639_1(Language lang
) {
254 if (! IsValidLanguage(lang
))
255 return kInvalidLanguageCode
;
256 if (const char* code
= kLanguageInfoTable
[lang
].language_code_639_1_
)
258 return kInvalidLanguageCode
;
261 const char* LanguageCodeISO639_2(Language lang
) {
262 if (! IsValidLanguage(lang
))
263 return kInvalidLanguageCode
;
264 if (const char* code
= kLanguageInfoTable
[lang
].language_code_639_2_
)
266 return kInvalidLanguageCode
;
269 const char* LanguageCodeWithDialects(Language lang
) {
272 return LanguageCode(lang
);
277 bool LanguageFromCode(const char* lang_code
, Language
*language
) {
278 *language
= UNKNOWN_LANGUAGE
;
279 if ( lang_code
== NULL
) return false;
281 for ( int i
= 0 ; i
< kNumLanguages
; i
++ ) {
282 const LanguageInfo
& info
= kLanguageInfoTable
[i
];
283 if ((info
.language_code_639_1_
&&
284 !base::strcasecmp(lang_code
, info
.language_code_639_1_
)) ||
285 (info
.language_code_639_2_
&&
286 !base::strcasecmp(lang_code
, info
.language_code_639_2_
)) ||
287 (info
.language_code_other_
&&
288 !base::strcasecmp(lang_code
, info
.language_code_other_
))) {
289 *language
= static_cast<Language
>(i
);
294 // For convenience, this function can also parse the non-standard
295 // five-letter language codes "zh-cn" and "zh-tw" which are used by
296 // front-ends such as GWS to distinguish Simplified from Traditional
298 if (!base::strcasecmp(lang_code
, "zh-cn") ||
299 !base::strcasecmp(lang_code
, "zh_cn")) {
303 if (!base::strcasecmp(lang_code
, "zh-tw") ||
304 !base::strcasecmp(lang_code
, "zh_tw")) {
305 *language
= CHINESE_T
;
308 if (!base::strcasecmp(lang_code
, "sr-me") ||
309 !base::strcasecmp(lang_code
, "sr_me")) {
310 *language
= MONTENEGRIN
;
314 // Process language-code synonyms.
315 if (!base::strcasecmp(lang_code
, "he")) {
316 *language
= HEBREW
; // Use "iw".
319 if (!base::strcasecmp(lang_code
, "in")) {
320 *language
= INDONESIAN
; // Use "id".
323 if (!base::strcasecmp(lang_code
, "ji")) {
324 *language
= YIDDISH
; // Use "yi".
328 // Process language-detection synonyms.
329 // These distinct languages cannot be differentiated by our current
330 // language-detection algorithms.
331 if (!base::strcasecmp(lang_code
, "fil")) {