third_party/cld/encodings/lang_enc.h

   1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // This file is for i18n. It contains two enums, namely Language and
   6 // Encoding, where Language is the linguistic convention, and Encoding
   7 // contains information on both language encoding and character set.
   8 //
   9 // The language and encoding are both based on Teragram's conventions,
  10 // except for some common ISO-8859 encodings that are not detected by
  11 // Teragram but might be in the future.
  12 //
  13 // This file also declares functions that map among Language/Encoding
  14 // enums, language/encoding string names (typically the output from
  15 // the Language Encoding identifier), language codes (ISO 639), and
  16 // two-letter country codes (ISO 3166).
  17 //
  18 // NOTE: Both Language and Encoding enums should always start from
  19 // zero value. This assumption has been made and used.
  20 //
  21
  22 #ifndef ENCODINGS_LANG_ENC_H__
  23 #define ENCODINGS_LANG_ENC_H__
  24
  25 #include "languages/public/languages.h"
  26 #include "encodings/public/encodings.h"
  27
  28
  29 // EncodingsForLanguage
  30 // --------------------
  31 //
  32 // Given the language, returns a pointer to an array of encodings this
  33 // language supports. Typically, the returned array has at least one
  34 // element: UNKNOWN_ENCODING, which is always the last element of the
  35 // array. The first encoding is the default encoding of the language.
  36 // Returns NULL if the input is invalid.
  37 //
  38 // Note: The output encoding array does not include ASCII_7BIT, UTF8
  39 // or UNICODE, which are good for all languages. TODO: Find out whether
  40 // it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
  41 // as special cases.
  42 //
  43 const Encoding* EncodingsForLanguage(Language lang);
  44
  45
  46 // DefaultEncodingForLanguage
  47 // --------------------------
  48 //
  49 // Given the language, returns the default encoding for the language
  50 // via the output argument encoding.
  51 //
  52 // The function returns true if the input lang is valid. Otherwise,
  53 // false is returned, and *encoding is set to UNKNOWN_ENCODING.
  54 //
  55 bool DefaultEncodingForLanguage(Language lang,
  56                                 Encoding *encoding);
  57
  58 // LanguagesForEncoding
  59 // --------------------
  60 //
  61 // Given the encoding, returns a pointer to an array of languages this
  62 // encoding supports. Typically, the returned array has at least one
  63 // element: UNKNOWN_LANGUAGE, which is always the last element of the
  64 // array. The first language in the array is the most popular
  65 // language for that encoding. NULL is returned if the input is
  66 // invalid.
  67 //
  68 // Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
  69 // UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
  70 // the languages or to treat these encodings as special cases.
  71 //
  72 // For other known encodings, ENGLISH is always included. This is
  73 // because English (Latin) characters are included in each encoding.
  74 //
  75 const Language* LanguagesForEncoding(Encoding enc);
  76
  77 // DefaultLanguageForEncoding
  78 // --------------------------
  79 //
  80 // Given the encoding, returns the default language for that encoding
  81 // via the output argument language.
  82 //
  83 // The function returns true if the input enc is valid. Otherwise,
  84 // false is returned, and *language is set to UNKNOWN_LANGUAGE.
  85 //
  86 // Note: this function is more useful for encodings that have only
  87 // one corresponding language, e.g. shift_jis => Japanese. There are
  88 // cases where multiple languages share the same encoding, for which
  89 // the default language is an arbitrary choice among them.
  90 //
  91 bool DefaultLanguageForEncoding(Encoding enc, Language* language);
  92
  93 //
  94 // IsLangEncCompatible
  95 // -------------------
  96 //
  97 // This function determines whether the input language and
  98 // encoding are compatible. For example, FRENCH and LATIN1 are
  99 // compatible, but FRENCH and GB are not.
 100 //
 101 // If either lang or enc is invalid, return false.
 102 // If lang is unknown, return true.
 103 //    (e.g. we can detect a page's encoding as Latin1 from metatag info, but
 104 //     cannot derive its language since more than one language is
 105 //     encoded in Latin1)
 106 // If lang is known but enc is unknown, return false.
 107 //    (returning true would do no good since we cannot convert to UTF8 anyway)
 108 // If enc is UNICODE or UTF8, return true.
 109 // Otherwise, check whether lang is supported by enc and enc is supported
 110 // by lang.
 111 //
 112 bool IsLangEncCompatible(Language lang, Encoding enc);
 113
 114 //
 115 // DominantLanguageFromEncoding
 116 // ----------------------------
 117 //
 118 // This function determines whether there exists a dominant language for
 119 // the input encoding. For example, the encoding GB has a dominant
 120 // language (Chinese), but Latin1 does not.
 121 //
 122 // The word "dominant" is used here because English characters are
 123 // included in each encoding.
 124 //
 125 // If there is no dominant language for the encoding, such as Latin1,
 126 // UNKNOWN_LANGUAGE is returned.
 127 //
 128 Language DominantLanguageFromEncoding(Encoding enc);
 129
 130 // LanguageCode
 131 // ------------
 132 // Given the Language and Encoding, returns the language code with dialects
 133 // (>= 2 letters).  Encoding is necessary to disambiguate between
 134 // Simplified and Traditional Chinese.
 135 //
 136 // See the note on Chinese Language Codes in
 137 // languages/public/languages.h
 138 // for the details.
 139
 140 const char* LanguageCode(Language lang, Encoding enc);
 141
 142 //
 143 // IsEncodingWithSupportedLanguage()
 144 // ---------------------------------
 145 //
 146 // There are some encodings listed here just because they are commonly
 147 // used.  There is no interface language for them yet. They are not
 148 // detected by Teragram, but can be detected from the meta info of the
 149 // HTML page.
 150 //
 151 // For example, we have listed ARABIC_ENCODING but there is no Arabic in
 152 // the Language enum. If the user inputs an Arabic query from the Google
 153 // main page, Netscape will just send the raw bytes to GWS, and GWS
 154 // will treat them as Latin1.  Therefore, there is no use in detecting
 155 // ARABIC_ENCODING for indexing, since such pages will never match the
 156 // queries, which are treated as Latin1 by GWS. On the contrary, if we
 157 // treat pages with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will
 158 // let them fall through as Latin1 at indexing time. And there might be
 159 // a match for some Arabic queries, which are also treated as Latin1 by
 160 // GWS. In fact, some people are relying on this feature to do Arabic
 161 // searches.
 162 //
 163 // Thus for these types of encodings, until we have UI support for
 164 // their languages and pretty comprehensive language/encoding
 165 // identification quality, it is better to revert them to
 166 // UNKNOWN_ENCODING.
 167 //
 168 // This function checks whether the input encoding is one with
 169 // an interface language.
 170 bool IsEncodingWithSupportedLanguage(Encoding enc);
 171
 172
 173 //
 174 // LangsFromCountryCode and EncFromCountryCode
 175 // -------------------------------------------
 176 //
 177 // These two functions return the possible languages and encodings,
 178 // respectively, according to the input country code, which is a
 179 // 2-letter string. The country code is usually specified in the url
 180 // of a document.
 181 //
 182 //
 183
 184 // LangsFromCountryCode
 185 // --------------------
 186 //
 187 // This function takes a string of arbitrary length. It treats the
 188 // first 2 bytes of the string as the country code, as defined in ISO
 189 // 3166-1993 (E).  It returns, via lang_arry and num_langs, an array of
 190 // the languages that are popular in that country, roughly in order of
 191 // popularity, together with the size of the array.
 192 //
 193 // This function returns true if we have language information for
 194 // country_code.  Otherwise, it returns false.
 195 //
 196 bool LangsFromCountryCode(const char* country_code,
 197                           const Language** lang_arry,
 198                           int* num_langs);
 199
 200
 201 //
 202 // EncFromCountryCode
 203 // ------------------
 204 //
 205 // This function takes a string of arbitrary length. It treats the
 206 // first 2 bytes of that string as the country code, as defined in ISO
 207 // 3166-1993 (E). It sets *enc to the encoding that is
 208 // most often used for the languages spoken in that country.
 209 //
 210 // This function returns true if we have encoding information for
 211 // country_code.  Otherwise, it returns false, and *enc is set to
 212 // UNKNOWN_ENCODING.
 213 //
 214 bool EncFromCountryCode(const char* country_code, Encoding* enc);
 215
 216
 217
 218 // VisualType
 219 // ----------
 220 //
 221 // Right-to-left documents may be in logical or visual order. When they
 222 // are in visual order we convert them to logical order before processing.
 223 // This enum lists the types of visual document we can encounter.
 224 // Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual.
 225 // The other documents in those languages, and all documents in non-RTL
 226 // languages, will be NOT_VISUAL_DOCUMENT.
 227 enum VisualType {
 228   NOT_VISUAL_DOCUMENT = 0,  // Logical-order RTL and all non-RTL documents.
 229   VISUAL_HEBREW_HTML,  // HTML documents in the legacy visual order.
 230   CONVERTED_RTL_PDF,   // Converted RTL PDFs, which are always visual.
 231 };
 232
 233 VisualType default_visualtype();  // Default VisualType (presumably NOT_VISUAL_DOCUMENT -- TODO confirm).
 234
 235 // VisualTypeName
 236 // --------------
 237 //
 238 // Given the visual type, returns its name as a C string, for debug output.
 239 const char* VisualTypeName(VisualType visualtype);
 240
 241
 242
 243 // InitLangEnc
 244 // -----------
 245 //
 246 // Ensures the LangEnc module has been initialized.  Normally this
 247 // happens during InitGoogle, but this allows access for scripts that
 248 // don't support InitGoogle. InitLangEnc calls InitEncodings (see
 249 // encodings/public/encodings.h) and also initializes data
 250 // structures used in lang_enc.cc.
 251 //
 252 void InitLangEnc();
 253
 254 #endif  // ENCODINGS_LANG_ENC_H__