third_party/cld/encodings/proto/encodings.pb.h

   1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifndef ENCODINGS_PROTO_ENCODINGS_PB_H_
   6 #define ENCODINGS_PROTO_ENCODINGS_PB_H_
   7
   8 enum Encoding {  // Character-encoding IDs, explicitly numbered 0..NUM_ENCODINGS-1.
   9   ISO_8859_1           =  0,  // Teragram ASCII
  10   ISO_8859_2           =  1,  // Teragram Latin2
  11   ISO_8859_3           =  2,  // in BasisTech but not in Teragram
  12   ISO_8859_4           =  3,  // Teragram Latin4
  13   ISO_8859_5           =  4,  // Teragram ISO-8859-5
  14   ISO_8859_6           =  5,  // Teragram Arabic
  15   ISO_8859_7           =  6,  // Teragram Greek
  16   ISO_8859_8           =  7,  // Teragram Hebrew
  17   ISO_8859_9           =  8,  // in BasisTech but not in Teragram
  18   ISO_8859_10          =  9,  // in BasisTech but not in Teragram
  19   JAPANESE_EUC_JP      = 10,  // Teragram EUC_JP
  20   JAPANESE_SHIFT_JIS   = 11,  // Teragram SJS
  21   JAPANESE_JIS         = 12,  // Teragram JIS
  22   CHINESE_BIG5         = 13,  // Teragram BIG5
  23   CHINESE_GB           = 14,  // Teragram GB
  24   CHINESE_EUC_CN       = 15,  // Misnamed. Should be EUC_TW. Was Basis Tech
  25                               // CNS11643EUC, before that Teragram EUC-CN(!)
  26                               // See //i18n/basistech/basistech_encodings.h
  27   KOREAN_EUC_KR        = 16,  // Teragram KSC
  28   UNICODE              = 17,  // Teragram Unicode
  29   CHINESE_EUC_DEC      = 18,  // Misnamed. Should be EUC_TW. Was Basis Tech
  30                               // CNS11643EUC, before that Teragram EUC.
  31   CHINESE_CNS          = 19,  // Misnamed. Should be EUC_TW. Was Basis Tech
  32                               // CNS11643EUC, before that Teragram CNS.
  33   CHINESE_BIG5_CP950   = 20,  // Teragram BIG5_CP950
  34   JAPANESE_CP932       = 21,  // Teragram CP932
  35   UTF8                 = 22,
  36   UNKNOWN_ENCODING     = 23,
  37   ASCII_7BIT           = 24,  // ISO_8859_1 with all characters <= 127.
  38                               // Should be present only in the crawler
  39                               // and in the repository,
  40                               // *never* as a result of Document::encoding().
  41   RUSSIAN_KOI8_R       = 25,  // Teragram KOI8R
  42   RUSSIAN_CP1251       = 26,  // Teragram CP1251
  43
  44   //----------------------------------------------------------
  45   // These are _not_ output from Teragram. Instead, they are as
  46   // detected in the headers of Usenet articles.
  47   MSFT_CP1252          = 27,  // CP1252 aka MSFT euro ascii
  48   RUSSIAN_KOI8_RU      = 28,  // CP21866 aka KOI8-U, used for Ukrainian.
  49                               // Misnamed, this is _not_ KOI8-RU but KOI8-U.
  50                               // KOI8-U is used much more often than KOI8-RU.
  51   MSFT_CP1250          = 29,  // CP1250 aka MSFT eastern european
  52   ISO_8859_15          = 30,  // aka ISO_8859_0 aka ISO_8859_1 euroized
  53   //----------------------------------------------------------
  54
  55   //----------------------------------------------------------
  56   // These are in BasisTech but not in Teragram. They are
  57   // needed for new interface languages. Now detected by
  58   // research langid
  59   MSFT_CP1254          = 31,  // used for Turkish
  60   MSFT_CP1257          = 32,  // used in Baltic countries
  61   //----------------------------------------------------------
  62
  63
  64   //----------------------------------------------------------
  65   // New encodings detected by Teragram
  66   ISO_8859_11          = 33,  // aka TIS-620, used for Thai
  67   MSFT_CP874           = 34,  // used for Thai
  68   MSFT_CP1256          = 35,  // used for Arabic
  69
  70   //----------------------------------------------------------
  71   // Detected as ISO_8859_8 by Teragram, but can be found in META tags
  72   MSFT_CP1255          = 36,  // Logical Hebrew Microsoft
  73   ISO_8859_8_I         = 37,  // Iso Hebrew Logical
  74   HEBREW_VISUAL        = 38,  // Iso Hebrew Visual
  75   //----------------------------------------------------------
  76
  77   //----------------------------------------------------------
  78   // Detected by research langid
  79   CZECH_CP852          = 39,
  80   CZECH_CSN_369103     = 40,  // aka ISO_IR_139 aka KOI8_CS
  81   MSFT_CP1253          = 41,  // used for Greek
  82   RUSSIAN_CP866        = 42,
  83   //----------------------------------------------------------
  84
  85   //----------------------------------------------------------
  86   // Handled by iconv in glibc
  87   ISO_8859_13          = 43,
  88   ISO_2022_KR          = 44,
  89   GBK                  = 45,
  90   GB18030              = 46,
  91   BIG5_HKSCS           = 47,
  92   ISO_2022_CN          = 48,
  93
  94   //-----------------------------------------------------------
  95   // Detected by xin liu's detector
  96   // Handled by transcoder
  97   // (Indic encodings)
  98
  99   TSCII                = 49,
 100   TAMIL_MONO           = 50,
 101   TAMIL_BI             = 51,
 102   JAGRAN               = 52,
 103
 104   // NOTE(review): ungrouped entries; MACINTOSH_ROMAN and UTF7 look non-Indic - confirm.
 105   MACINTOSH_ROMAN      = 53,
 106   UTF7                 = 54,
 107   BHASKAR              = 55,  // Indic encoding - Devanagari
 108   HTCHANAKYA           = 56,  // Indic encoding - Devanagari
 109
 110   //-----------------------------------------------------------
 111   // These allow a single place (inputconverter and outputconverter)
 112   // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
 113   // bulk conversions, with interchange-valid checking on input and
 114   // fallback if needed on output.
 115   UTF16BE              = 57,  // big-endian UTF-16
 116   UTF16LE              = 58,  // little-endian UTF-16
 117   UTF32BE              = 59,  // big-endian UTF-32
 118   UTF32LE              = 60,  // little-endian UTF-32
 119   //-----------------------------------------------------------
 120
 121   //-----------------------------------------------------------
 122   // An encoding that means "This is not text, but it may have some
 123   // simple ASCII text embedded". Intended input conversion (not yet
 124   // implemented) is to keep strings of >=4 seven-bit ASCII characters
 125   // (follow each kept string with an ASCII space), delete the rest of
 126   // the bytes. This will pick up and allow indexing of e.g. captions
 127   // in JPEGs. No output conversion needed.
 128   BINARYENC            = 61,
 129   //-----------------------------------------------------------
 130
 131   //-----------------------------------------------------------
 132   // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
 133   // ~{ ... ~} for 2-byte pairs, and the browsers support this.
 134   HZ_GB_2312           = 62,
 135   //-----------------------------------------------------------
 136
 137   //-----------------------------------------------------------
 138   // Some external vendors make the common input error of
 139   // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
 140   UTF8UTF8             = 63,
 141   //-----------------------------------------------------------
 142
 143   //-----------------------------------------------------------
 144   // Handled by transcoder for Tamil language-specific font
 145   // encodings without the support for detection at present.
 146   TAM_ELANGO           = 64,  // Elango - Tamil
 147   TAM_LTTMBARANI       = 65,  // Barani - Tamil
 148   TAM_SHREE            = 66,  // Shree - Tamil
 149   TAM_TBOOMIS          = 67,  // TBoomis - Tamil
 150   TAM_TMNEWS           = 68,  // TMNews - Tamil
 151   TAM_WEBTAMIL         = 69,  // Webtamil - Tamil
 152   //-----------------------------------------------------------
 153
 154   //-----------------------------------------------------------
 155   // Shift_JIS variants used by Japanese cell phone carriers.
 156   KDDI_SHIFT_JIS       = 70,
 157   DOCOMO_SHIFT_JIS     = 71,
 158   SOFTBANK_SHIFT_JIS   = 72,
 159   // ISO-2022-JP variants used by KDDI and SoftBank.
 160   KDDI_ISO_2022_JP     = 73,
 161   SOFTBANK_ISO_2022_JP = 74,
 162   //-----------------------------------------------------------
 163
 164   NUM_ENCODINGS        = 75,  // Always keep this at the end. It is not a
 165                               // valid Encoding value; it only indicates
 166                               // the total number of Encodings.
 167 };
 168
 169 #endif  // ENCODINGS_PROTO_ENCODINGS_PB_H_