Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / third_party / cld / languages / public / languages.h
blob9d67f6c88fb1911f0aebdd17f1516607d1a22d21
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef LANGUAGES_PUBLIC_LANGUAGES_H_
6 #define LANGUAGES_PUBLIC_LANGUAGES_H_
8 // This interface defines the Language enum and functions that depend
9 // only on Language values.
11 // A hash-function for Language, hash<Language>, is defined in
12 // i18n/languages/public/languages-hash.h
14 #ifndef SWIG
15 // The Language enum is defined in languages.proto, which also contains a
16 // description of how to add languages.
17 #include "languages/proto/languages.pb.h"
19 // We need this for compatibility:
20 // - The Language enum in the default namespace.
21 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
22 //using namespace i18n::languages;
24 #else
25 // And we must have a swig-compatible enum.
26 // This one is a simple cleaned up version of language.proto, making the enum
27 // compatible with C++.
28 #include "i18n/languages/internal/languages_proto_wrapper.h"
30 #endif
32 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
33 //#include "util/utf8/proptables/script_enum.h"
35 const int kNumLanguages = NUM_LANGUAGES;
37 // Return the default language (ENGLISH).
38 Language default_language();
41 // *******************************************
42 // Language predicates
43 // IsValidLanguage()
44 // IS_LANGUAGE_UNKNOWN()
45 // IsCJKLanguage()
46 // IsChineseLanguage()
47 // IsNorwegianLanguage()
48 // IsPortugueseLanguage()
49 // IsRightToLeftLanguage()
50 // IsMaybeRightToLeftLanguage()
51 // IsSameLanguage()
52 // IsScriptRequiringLongerSnippets()
53 // *******************************************
55 // IsValidLanguage
56 // ===============
58 // Function to check if the input is within range of the Language enum. If
59 // IsValidLanguage(lang) returns true, it is safe to call
60 // static_cast<Language>(lang).
62 inline bool IsValidLanguage(int lang) {
63 return ((lang >= 0) && (lang < kNumLanguages));
66 // Return true if the language is "unknown". (This function was
67 // previously a macro, hence the spelling in all caps.)
69 inline bool IS_LANGUAGE_UNKNOWN(Language lang) {
70 return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE;
73 // IsCJKLanguage
74 // -------------
76 // This function returns true if the language is either Chinese
77 // (simplified or traditional), Japanese, or Korean.
78 bool IsCJKLanguage(Language lang);
80 // IsChineseLanguage
81 // -----------------
83 // This function returns true if the language is either Chinese
84 // (simplified or traditional)
85 bool IsChineseLanguage(Language lang);
87 // IsNorwegianLanguage
88 // --------------------
90 // This function returns true if the language is any of the Norwegian
91 // languages (regular or Nynorsk).
92 bool IsNorwegianLanguage(Language lang);
94 // IsPortugueseLanguage
95 // --------------------
97 // This function returns true if the language is any of the Portuguese
98 // languages (regular, Portugal or Brazil)
99 bool IsPortugueseLanguage(Language lang);
101 // IsSameLanguage
102 // --------------
104 // WARNING: This function provides only a simple test on the values of
105 // the two Language arguments. It returns false if either language is
106 // invalid. It returns true if the language arguments are equal, or
107 // if they are both Chinese languages, both Norwegian languages, or
108 // both Portuguese languages, as defined by IsChineseLanguage,
109 // IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns
110 // false.
111 bool IsSameLanguage(Language lang1, Language lang2);
114 // IsRightToLeftLanguage
115 // ---------------------
117 // This function returns true if the language is only written right-to-left
118 // (E.g., Hebrew, Arabic, Persian etc.)
120 // IMPORTANT NOTE: Technically we're talking about scripts, not languages.
121 // There are languages that can be written in more than one script.
122 // Examples:
123 // - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in
124 // Latin or Cyrillic script, and right-to-left in Arabic script.
125 // - Sindhi and Punjabi are written in different scripts, depending on
126 // region and dialect.
127 // - Turkmen used an Arabic script historically, but not any more.
128 // - Pashto and Uyghur can use Arabic script, but use a Roman script
129 // on the Internet.
130 // - Kashmiri and Urdu are written either with Arabic or Devanagari script.
132 // This function only returns true for languages that are always, unequivocally
133 // written in right-to-left script.
135 // TODO(benjy): If we want to do anything special with multi-script languages
136 // we should create new 'languages' for each language+script, as we do for
137 // traditional vs. simplified Chinese. However most such languages are rare in
138 // use and even rarer on the web, so this is unlikely to be something we'll
139 // be concerned with for a while.
140 bool IsRightToLeftLanguage(Language lang);
142 // IsMaybeRightToLeftLanguage
143 // --------------------------
145 // This function returns true if the language may appear on the web in a
146 // right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)
148 // NOTE: See important notes under IsRightToLeftLanguage(...).
150 // This function returns true for languages that *may* appear on the web in a
151 // right-to-left script, even if they may also appear in a left-to-right
152 // script.
154 // This function should typically be used in cases where doing some work on
155 // left-to-right text would be OK (usually a no-op), and this function is used
156 // just to cut down on unnecessary work on regular, LTR text.
157 bool IsMaybeRightToLeftLanguage(Language lang);
159 // IsScriptRequiringLongerSnippets
160 // --------------------
162 // This function returns true if the script characteristics require longer
163 // snippet length (Devanagari, Bengali, Gurmukhi,
164 // Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).
165 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
166 // bool IsScriptRequiringLongerSnippets(UnicodeScript script);
169 // *******************************************
170 // LANGUAGE NAMES
172 // This interface defines a standard name for each valid Language,
173 // and a standard name for invalid languages. Some language names use all
174 // uppercase letters, but others use mixed case.
175 // LanguageName() [Language to name]
176 // LanguageEnumName() [language to enum name]
177 // LanguageFromName() [name to Language]
178 // default_language_name()
179 // invalid_language_name()
180 // *******************************************
182 // Given a Language, returns its standard name.
183 // Return invalid_language_name() if the language is invalid.
184 const char* LanguageName(Language lang);
186 // Given a Language, return the name of the enum constant for that
187 // language. In all but a few cases, this is the same as its standard
188 // name. For example, LanguageName(CHINESE) returns "Chinese", but
189 // LanguageEnumName(CHINESE) returns "CHINESE". This is intended for
190 // code that is generating C++ code, where the enum constant is more
191 // useful than its integer value. Return "NUM_LANGUAGES" if
192 // the language is invalid.
193 const char* LanguageEnumName(Language lang);
195 // The maximum length of a standard language name.
196 const int kMaxLanguageNameSize = 50;
198 // The standard name for the default language.
199 const char* default_language_name();
201 // The standard name for all invalid languages.
202 const char* invalid_language_name();
204 // If lang_name matches the standard name of a Language, using a
205 // case-insensitive comparison, set *language to that Language and
206 // return true.
207 // Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
209 // For backwards compatibility, "HATIAN_CREOLE" is allowed as a name
210 // for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.
211 // For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed
212 // as a name for UNKNOWN_LANGUAGE (the return value is true in this case,
213 // as it is for "Unknown"), and "CHINESE_T" is allowed as a name for
214 // CHINESE_T (i.e., a synonym for "ChineseT").
216 // REQUIRES: language must not be NULL.
218 bool LanguageFromName(const char* lang_name, Language *language);
222 // *******************************************
223 // LANGUAGE CODES
225 // This interface defines a standard code for each valid language, and
226 // a standard code for invalid languages. These are derived from ISO codes,
227 // with some Google additions.
228 // LanguageCode()
229 // default_language_code()
230 // invalid_language_code()
231 // LanguageCodeWithDialects()
232 // LanguageCodeISO639_1()
233 // LanguageCodeISO639_2()
234 // *******************************************
236 // Given a Language, return its standard code. There are Google-specific codes:
237 // For CHINESE_T, return "zh-TW".
238 // For TG_UNKNOWN_LANGUAGE, return "ut".
239 // For UNKNOWN_LANGUAGE, return "un".
240 // For PORTUGUESE_P, return "pt-PT".
241 // For PORTUGUESE_B, return "pt-BR".
242 // For LIMBU, return "sit-NP".
243 // For CHEROKEE, return "chr".
244 // For SYRIAC, return "syr".
245 // Otherwise return the ISO 639-1 two-letter language code for lang.
246 // If lang is invalid, return invalid_language_code().
248 // NOTE: See the note below about the codes for Chinese languages.
250 const char* LanguageCode(Language lang);
252 // The maximum length of a language code.
253 const int kMaxLanguageCodeSize = 50;
255 // The standard code for the default language.
256 const char* default_language_code();
258 // The standard code for all invalid languages.
259 const char* invalid_language_code();
262 // --------------------------------------------
263 // NOTE: CHINESE LANGUAGE CODES
265 // There are three functions that return codes for Chinese languages.
266 // LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.
267 // LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h.
268 // The following list shows the different results.
270 // LanguageCode(CHINESE) returns "zh"
271 // LanguageCode(CHINESE_T) returns "zh-TW".
273 // LanguageCodeWithDialects(CHINESE) returns "zh-CN".
274 // LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".
276 // LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".
277 // LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".
278 // LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".
280 // --------------------------------------------
282 // LanguageCodeWithDialects
283 // ------------------------
285 // If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).
286 const char* LanguageCodeWithDialects(Language lang);
288 // LanguageCodeISO639_1
289 // --------------------
291 // Return the ISO 639-1 two-letter language code for lang.
292 // Return invalid_language_code() if lang is invalid or does not have
293 // an ISO 639-1 two-letter language code.
294 const char* LanguageCodeISO639_1(Language lang);
296 // LanguageCodeISO639_2
297 // --------------------
299 // Return the ISO 639-2 three-letter language code for lang.
300 // Return invalid_language_code() if lang is invalid or does not have
301 // an ISO 639-2 three-letter language code.
302 const char* LanguageCodeISO639_2(Language lang);
304 // LanguageFromCode
305 // ----------------
307 // If lang_code matches the code for a Language, using a case-insensitive
308 // comparison, set *language to that Language and return true.
309 // Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
311 // lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2
312 // (three-letter) code, or a Google-specific code (see LanguageCode).
314 // Certain language-code aliases are also allowed:
315 // For "zh-cn" and "zh_cn", set *language to CHINESE.
316 // For "zh-tw" and "zh_tw", set *language to CHINESE_T.
317 // For "he", set *language to HEBREW.
318 // For "in", set *language to INDONESIAN.
319 // For "ji", set *language to YIDDISH.
320 // For "fil", set *language to TAGALOG.
322 // REQUIRES: 'language' must not be NULL.
323 bool LanguageFromCode(const char* lang_code, Language *language);
326 // LanguageFromCodeOrName
327 // ----------------------
329 // If lang_code_or_name is a language code or a language name,
330 // set *language to the corresponding Language and return true.
331 // Otherwise set *language to UNKNOWN_LANGUAGE and return false.
333 bool LanguageFromCodeOrName(const char* lang_code_or_name,
334 Language* language);
336 // LanguageNameFromCode
337 // --------------------
339 // If language_code is the code for a Language (see LanguageFromCode),
340 // return the standard name of that language (see LanguageName).
341 // Otherwise return invalid_language_name().
343 const char* LanguageNameFromCode(const char* language_code);
346 // Miscellany
348 // LanguageCodeToUnderscoreForm
349 // ----------------------------
351 // Given a language code, convert the dash "-" to underscore "_".
353 // Specifically, if result_length <= strlen(lang_code), set result[0]
354 // to '\0' and return false. Otherwise, copy lang_code to result,
355 // converting every dash to an underscore, converting every character
356 // before the first dash or underscore to lower case, and converting
357 // every character after the first dash or underscore to upper
358 // case. If there is no dash or underscore, convert the entire string
359 // to lower case.
361 // REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.
363 bool LanguageCodeToUnderscoreForm(const char* lang_code,
364 char* result,
365 int result_length);
368 // AlwaysPutInExpectedRestrict
369 // ---------------------------
371 // For Web pages in certain top-level domains, Web Search always
372 // applies a "country restrict". If 'tld' matches one of those, using
373 // a case-SENSITIVE comparison, set *expected_language to the Language
374 // most commonly found in that top-level domain and return true.
375 // Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.
376 bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);
379 #endif // LANGUAGES_PUBLIC_LANGUAGES_H_