third_party/cld/languages/public/languages.h

   1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifndef LANGUAGES_PUBLIC_LANGUAGES_H_
   6 #define LANGUAGES_PUBLIC_LANGUAGES_H_
   7
   8 // This interface defines the Language enum and functions that depend
   9 // only on Language values.
  10
  11 // A hash-function for Language, hash<Language>, is defined in
  12 // i18n/languages/public/languages-hash.h
  13
  14 #ifndef SWIG
  15 // The Language enum is defined in languages.proto, which also describes
  16 // how to add languages.
  17 #include "languages/proto/languages.pb.h"
  18
  19 // We need this for compatibility:
  20 // - The Language enum in the default namespace.
  21 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
  22 //using namespace i18n::languages;
  23
  24 #else
  25 // And we must have a swig-compatible enum.
  26 // This one is a simple cleaned up version of language.proto, making the enum
  27 // compatible with C++.
  28 #include "i18n/languages/internal/languages_proto_wrapper.h"
  29
  30 #endif
  31
  32 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
  33 //#include "util/utf8/proptables/script_enum.h"
  34
  35 const int kNumLanguages = NUM_LANGUAGES;
  36
  37 // Return the default language (ENGLISH).
  38 Language default_language();
  39
  40
  41 // *******************************************
  42 // Language predicates
  43 //   IsValidLanguage()
  44 //   IS_LANGUAGE_UNKNOWN()
  45 //   IsCJKLanguage()
  46 //   IsChineseLanguage()
  47 //   IsNorwegianLanguage()
  48 //   IsPortugueseLanguage()
  49 //   IsRightToLeftLanguage()
  50 //   IsMaybeRightToLeftLanguage()
  51 //   IsSameLanguage()
  52 //   IsScriptRequiringLongerSnippets()
  53 // *******************************************
  54
  55 // IsValidLanguage
  56 // ===============
  57 //
  58 // Function to check if the input is within range of the Language enum. If
  59 // IsValidLanguage(lang) returns true, it is safe to call
  60 // static_cast<Language>(lang).
  61 //
  62 inline bool IsValidLanguage(int lang) {
  63   return ((lang >= 0) && (lang < kNumLanguages));
  64 }
  65
  66 // Return true if the language is "unknown". (This function was
  67 // previously a macro, hence the spelling in all caps.)
  68 //
  69 inline bool IS_LANGUAGE_UNKNOWN(Language lang) {
  70   return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE;
  71 }
  72
  73 // IsCJKLanguage
  74 // -------------
  75 //
  76 // This function returns true if the language is either Chinese
  77 // (simplified or traditional), Japanese, or Korean.
  78 bool IsCJKLanguage(Language lang);
  79
  80 // IsChineseLanguage
  81 // -----------------
  82 //
  83 // This function returns true if the language is either Chinese
  84 // (simplified or traditional)
  85 bool IsChineseLanguage(Language lang);
  86
  87 // IsNorwegianLanguage
  88 // --------------------
  89 //
  90 // This function returns true if the language is any of the Norwegian
  91 // languages (regular or Nynorsk).
  92 bool IsNorwegianLanguage(Language lang);
  93
  94 // IsPortugueseLanguage
  95 // --------------------
  96 //
  97 // This function returns true if the language is any of the Portuguese
  98 // languages (regular, Portugal or Brazil)
  99 bool IsPortugueseLanguage(Language lang);
 100
 101 // IsSameLanguage
 102 // --------------
 103 //
 104 // WARNING: This function provides only a simple test on the values of
 105 // the two Language arguments. It returns false if either language is
 106 // invalid. It returns true if the language arguments are equal, or
 107 // if they are both Chinese languages, both Norwegian languages, or
 108 // both Portuguese languages, as defined by IsChineseLanguage,
 109 // IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns
 110 // false.
 111 bool IsSameLanguage(Language lang1, Language lang2);
 112
 113
 114 // IsRightToLeftLanguage
 115 // ---------------------
 116 //
 117 // This function returns true if the language is only written right-to-left
 118 // (E.g., Hebrew, Arabic, Persian etc.)
 119 //
 120 // IMPORTANT NOTE: Technically we're talking about scripts, not languages.
 121 // There are languages that can be written in more than one script.
 122 // Examples:
 123 //   - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in
 124 //     Latin or Cyrillic script, and right-to-left in Arabic script.
 125 //   - Sindhi and Punjabi are written in different scripts, depending on
 126 //     region and dialect.
 127 //   - Turkmen used an Arabic script historically, but not any more.
 128 //   - Pashto and Uyghur can use Arabic script, but use a Roman script
 129 //     on the Internet.
 130 //   - Kashmiri and Urdu are written either with Arabic or Devanagari script.
 131 //
 132 // This function only returns true for languages that are always, unequivocally
 133 // written in right-to-left script.
 134 //
 135 // TODO(benjy): If we want to do anything special with multi-script languages
 136 // we should create new 'languages' for each language+script, as we do for
 137 // traditional vs. simplified Chinese. However most such languages are rare in
 138 // use and even rarer on the web, so this is unlikely to be something we'll
 139 // be concerned with for a while.
 140 bool IsRightToLeftLanguage(Language lang);
 141
 142 // IsMaybeRightToLeftLanguage
 143 // --------------------------
 144 //
 145 // This function returns true if the language may appear on the web in a
 146 // right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)
 147 //
 148 // NOTE: See important notes under IsRightToLeftLanguage(...).
 149 //
 150 // This function returns true for languages that *may* appear on the web in a
 151 // right-to-left script, even if they may also appear in a left-to-right
 152 // script.
 153 //
 154 // This function should typically be used in cases where doing some work on
 155 // left-to-right text would be OK (usually a no-op), and this function is used
 156 // just to cut down on unnecessary work on regular, LTR text.
 157 bool IsMaybeRightToLeftLanguage(Language lang);
 158
 159 // IsScriptRequiringLongerSnippets
 160 // --------------------
 161 //
 162 // This function returns true if the script characteristics require longer
 163 // snippet length (Devanagari, Bengali, Gurmukhi,
 164 // Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).
 165 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
 166 // bool IsScriptRequiringLongerSnippets(UnicodeScript script);
 167
 168
 169 // *******************************************
 170 // LANGUAGE NAMES
 171 //
 172 // This interface defines a standard name for each valid Language,
 173 // and a standard name for invalid languages. Some language names use all
 174 // uppercase letters, but others use mixed case.
 175 //   LanguageName() [Language to name]
 176 //   LanguageEnumName() [language to enum name]
 177 //   LanguageFromName() [name to Language]
 178 //   default_language_name()
 179 //   invalid_language_name()
 180 // *******************************************
 181
 182 // Given a Language, returns its standard name.
 183 // Return invalid_language_name() if the language is invalid.
 184 const char* LanguageName(Language lang);
 185
 186 // Given a Language, return the name of the enum constant for that
 187 // language. In all but a few cases, this is the same as its standard
 188 // name. For example, LanguageName(CHINESE) returns "Chinese", but
 189 // LanguageEnumName(CHINESE) returns "CHINESE". This is intended for
 190 // code that is generating C++ code, where the enum constant is more
 191 // useful than its integer value.  Return "NUM_LANGUAGES" if
 192 // the language is invalid.
 193 const char* LanguageEnumName(Language lang);
 194
 195 // The maximum length of a standard language name.
 196 const int kMaxLanguageNameSize = 50;
 197
 198 // The standard name for the default language.
 199 const char* default_language_name();
 200
 201 // The standard name for all invalid languages.
 202 const char* invalid_language_name();
 203
 204 // If lang_name matches the standard name of a Language, using a
 205 // case-insensitive comparison, set *language to that Language and
 206 // return true.
 207 // Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
 208 //
 209 // For backwards compatibility, "HATIAN_CREOLE" is allowed as a name
 210 // for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.
 211 // For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed
 212 // as a name for UNKNOWN_LANGUAGE (the return value is true in this case,
 213 // as it is for "Unknown"), and "CHINESE_T" is allowed as a name for
 214 // CHINESE_T (i.e., a synonym for "ChineseT").
 215 //
 216 // REQUIRES: language must not be NULL.
 217 //
 218 bool LanguageFromName(const char* lang_name, Language *language);
 219
 220
 221
 222 // *******************************************
 223 // LANGUAGE CODES
 224 //
 225 // This interface defines a standard code for each valid language, and
 226 // a standard code for invalid languages. These are derived from ISO codes,
 227 // with some Google additions.
 228 //   LanguageCode()
 229 //   default_language_code()
 230 //   invalid_language_code()
 231 //   LanguageCodeWithDialects()
 232 //   LanguageCodeISO639_1()
 233 //   LanguageCodeISO639_2()
 234 // *******************************************
 235
 236 // Given a Language, return its standard code. There are Google-specific codes:
 237 //     For CHINESE_T, return "zh-TW".
 238 //     For TG_UNKNOWN_LANGUAGE, return "ut".
 239 //     For UNKNOWN_LANGUAGE, return "un".
 240 //     For PORTUGUESE_P, return "pt-PT".
 241 //     For PORTUGUESE_B, return "pt-BR".
 242 //     For LIMBU, return "sit-NP".
 243 //     For CHEROKEE, return "chr".
 244 //     For SYRIAC, return "syr".
 245 // Otherwise return the ISO 639-1 two-letter language code for lang.
 246 // If lang is invalid, return invalid_language_code().
 247 //
 248 // NOTE: See the note below about the codes for Chinese languages.
 249 //
 250 const char* LanguageCode(Language lang);
 251
 252 // The maximum length of a language code.
 253 const int kMaxLanguageCodeSize = 50;
 254
 255 // The standard code for the default language.
 256 const char* default_language_code();
 257
 258 // The standard code for all invalid languages.
 259 const char* invalid_language_code();
 260
 261
 262 // --------------------------------------------
 263 // NOTE: CHINESE LANGUAGE CODES
 264 //
 265 // There are three functions that return codes for Chinese languages.
 266 // LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.
 267 // LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h.
 268 // The following list shows the different results.
 269 //
 270 // LanguageCode(CHINESE) returns "zh"
 271 // LanguageCode(CHINESE_T) returns "zh-TW".
 272 //
 273 // LanguageCodeWithDialects(CHINESE) returns "zh-CN".
 274 // LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".
 275 //
 276 // LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".
 277 // LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".
 278 // LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".
 279 //
 280 // --------------------------------------------
 281
 282 // LanguageCodeWithDialects
 283 // ------------------------
 284 //
 285 // If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).
 286 const char* LanguageCodeWithDialects(Language lang);
 287
 288 // LanguageCodeISO639_1
 289 // --------------------
 290 //
 291 // Return the ISO 639-1 two-letter language code for lang.
 292 // Return invalid_language_code() if lang is invalid or does not have
 293 // an ISO 639-1 two-letter language code.
 294 const char* LanguageCodeISO639_1(Language lang);
 295
 296 // LanguageCodeISO639_2
 297 // --------------------
 298 //
 299 // Return the ISO 639-2 three-letter language code for lang.
 300 // Return invalid_language_code() if lang is invalid or does not have
 301 // an ISO 639-2 three-letter language code.
 302 const char* LanguageCodeISO639_2(Language lang);
 303
 304 // LanguageFromCode
 305 // ----------------
 306 //
 307 // If lang_code matches the code for a Language, using a case-insensitive
 308 // comparison, set *language to that Language and return true.
 309 // Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
 310 //
 311 // lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2
 312 // (three-letter) code, or a Google-specific code (see LanguageCode).
 313 //
 314 // Certain language-code aliases are also allowed:
 315 //   For "zh-cn" and "zh_cn", set *language to CHINESE.
 316 //   For "zh-tw" and "zh_tw", set *language to CHINESE_T.
 317 //   For "he", set *language to HEBREW.
 318 //   For "in", set *language to INDONESIAN.
 319 //   For "ji", set *language to YIDDISH.
 320 //   For "fil", set *language to TAGALOG.
 321 //
 322 // REQUIRES: 'language' must not be NULL.
 323 bool LanguageFromCode(const char* lang_code, Language *language);
 324
 325
 326 // LanguageFromCodeOrName
 327 // ----------------------
 328 //
 329 // If lang_code_or_name is a language code or a language name,
 330 // set *language to the corresponding Language and return true.
 331 // Otherwise set *language to UNKNOWN_LANGUAGE and return false.
 332 //
 333 bool LanguageFromCodeOrName(const char* lang_code_or_name,
 334                             Language* language);
 335
 336 // LanguageNameFromCode
 337 // --------------------
 338 //
 339 // If language_code is the code for a Language (see LanguageFromCode),
 340 // return the standard name of that language (see LanguageName).
 341 // Otherwise return invalid_language_name().
 342 //
 343 const char* LanguageNameFromCode(const char* language_code);
 344
 345
 346 // Miscellany
 347
 348 // LanguageCodeToUnderscoreForm
 349 // ----------------------------
 350 //
 351 // Given a language code, convert the dash "-" to underscore "_".
 352 //
 353 // Specifically, if result_length <= strlen(lang_code), set result[0]
 354 // to '\0' and return false. Otherwise, copy lang_code to result,
 355 // converting every dash to an underscore, converting every character
 356 // before the first dash or underscore to lower case, and converting
 357 // every character after the first dash or underscore to upper
 358 // case. If there is no dash or underscore, convert the entire string
 359 // to lower case.
 360 //
 361 // REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.
 362
 363 bool LanguageCodeToUnderscoreForm(const char* lang_code,
 364                                   char* result,
 365                                   int result_length);
 366
 367 //
 368 // AlwaysPutInExpectedRestrict
 369 // ---------------------------
 370 //
 371 // For Web pages in certain top-level domains, Web Search always
 372 // applies a "country restrict". If 'tld' matches one of those, using
 373 // a case-SENSITIVE comparison, set *expected_language to the Language
 374 // most commonly found in that top-level domain and return true.
 375 // Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.
 376 bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);
 377
 378
 379 #endif  // LANGUAGES_PUBLIC_LANGUAGES_H_