1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef ENCODINGS_PROTO_ENCODINGS_PB_H_
6 #define ENCODINGS_PROTO_ENCODINGS_PB_H_
9 ISO_8859_1
= 0, // Teragram ASCII
10 ISO_8859_2
= 1, // Teragram Latin2
11 ISO_8859_3
= 2, // in BasisTech but not in Teragram
12 ISO_8859_4
= 3, // Teragram Latin4
13 ISO_8859_5
= 4, // Teragram ISO-8859-5
14 ISO_8859_6
= 5, // Teragram Arabic
15 ISO_8859_7
= 6, // Teragram Greek
16 ISO_8859_8
= 7, // Teragram Hebrew
17 ISO_8859_9
= 8, // in BasisTech but not in Teragram
18 ISO_8859_10
= 9, // in BasisTech but not in Teragram
19 JAPANESE_EUC_JP
= 10, // Teragram EUC_JP
20 JAPANESE_SHIFT_JIS
= 11, // Teragram SJS
21 JAPANESE_JIS
= 12, // Teragram JIS
22 CHINESE_BIG5
= 13, // Teragram BIG5
23 CHINESE_GB
= 14, // Teragram GB
24 CHINESE_EUC_CN
= 15, // Misnamed. Should be EUC_TW. Was Basis Tech
25 // CNS11643EUC, before that Teragram EUC-CN(!)
26 // See //i18n/basistech/basistech_encodings.h
27 KOREAN_EUC_KR
= 16, // Teragram KSC
28 UNICODE
= 17, // Teragram Unicode
29 CHINESE_EUC_DEC
= 18, // Misnamed. Should be EUC_TW. Was Basis Tech
30 // CNS11643EUC, before that Teragram EUC.
31 CHINESE_CNS
= 19, // Misnamed. Should be EUC_TW. Was Basis Tech
32 // CNS11643EUC, before that Teragram CNS.
33 CHINESE_BIG5_CP950
= 20, // Teragram BIG5_CP950
34 JAPANESE_CP932
= 21, // Teragram CP932
36 UNKNOWN_ENCODING
= 23,
37 ASCII_7BIT
= 24, // ISO_8859_1 with all characters <= 127.
38 // Should be present only in the crawler
39 // and in the repository,
40 // *never* as a result of Document::encoding().
41 RUSSIAN_KOI8_R
= 25, // Teragram KOI8R
42 RUSSIAN_CP1251
= 26, // Teragram CP1251
44 //----------------------------------------------------------
45 // These are _not_ output from teragram. Instead, they are as
46 // detected in the headers of usenet articles.
47 MSFT_CP1252
= 27, // 27: CP1252 aka MSFT euro ascii
48 RUSSIAN_KOI8_RU
= 28, // CP21866 aka KOI8-U, used for Ukrainian.
49 // Misnamed, this is _not_ KOI8-RU but KOI8-U.
50 // KOI8-U is used much more often than KOI8-RU.
51 MSFT_CP1250
= 29, // CP1250 aka MSFT eastern european
52 ISO_8859_15
= 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
53 //----------------------------------------------------------
55 //----------------------------------------------------------
56 // These are in BasisTech but not in Teragram. They are
57 // needed for new interface languages. Now detected by
59 MSFT_CP1254
= 31, // used for Turkish
60 MSFT_CP1257
= 32, // used in Baltic countries
61 //----------------------------------------------------------
63 //----------------------------------------------------------
64 //----------------------------------------------------------
65 // New encodings detected by Teragram
66 ISO_8859_11
= 33, // aka TIS-620, used for Thai
67 MSFT_CP874
= 34, // used for Thai
68 MSFT_CP1256
= 35, // used for Arabic
70 //----------------------------------------------------------
71 // Detected as ISO_8859_8 by Teragram, but can be found in META tags
72 MSFT_CP1255
= 36, // Logical Hebrew Microsoft
73 ISO_8859_8_I
= 37, // Iso Hebrew Logical
74 HEBREW_VISUAL
= 38, // Iso Hebrew Visual
75 //----------------------------------------------------------
77 //----------------------------------------------------------
78 // Detected by research langid
80 CZECH_CSN_369103
= 40, // aka ISO_IR_139 aka KOI8_CS
81 MSFT_CP1253
= 41, // used for Greek
83 //----------------------------------------------------------
85 //----------------------------------------------------------
86 // Handled by iconv in glibc
94 //-----------------------------------------------------------
95 // Detected by xin liu's detector
96 // Handled by transcoder
105 MACINTOSH_ROMAN
= 53,
107 BHASKAR
= 55, // Indic encoding - Devanagari
108 HTCHANAKYA
= 56, // 56 Indic encoding - Devanagari
110 //-----------------------------------------------------------
111 // These allow a single place (inputconverter and outputconverter)
112 // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
113 // bulk conversions, with interchange-valid checking on input and
114 // fallback if needed on output.
115 UTF16BE
= 57, // big-endian UTF-16
116 UTF16LE
= 58, // little-endian UTF-16
117 UTF32BE
= 59, // big-endian UTF-32
118 UTF32LE
= 60, // little-endian UTF-32
119 //-----------------------------------------------------------
121 //-----------------------------------------------------------
122 // An encoding that means "This is not text, but it may have some
123 // simple ASCII text embedded". Intended input conversion (not yet
124 // implemented) is to keep strings of >=4 seven-bit ASCII characters
125 // (follow each kept string with an ASCII space), delete the rest of
126 // the bytes. This will pick up and allow indexing of e.g. captions
127 // in JPEGs. No output conversion needed.
129 //-----------------------------------------------------------
131 //-----------------------------------------------------------
132 // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
133 // ~{ ... ~} for 2-byte pairs, and the browsers support this.
135 //-----------------------------------------------------------
137 //-----------------------------------------------------------
138 // Some external vendors make the common input error of
139 // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
141 //-----------------------------------------------------------
143 //-----------------------------------------------------------
144 // Handled by transcoder for tamil language specific font
145 // encodings without the support for detection at present.
146 TAM_ELANGO
= 64, // Elango - Tamil
147 TAM_LTTMBARANI
= 65, // Barani - Tamil
148 TAM_SHREE
= 66, // Shree - Tamil
149 TAM_TBOOMIS
= 67, // TBoomis - Tamil
150 TAM_TMNEWS
= 68, // TMNews - Tamil
151 TAM_WEBTAMIL
= 69, // Webtamil - Tamil
152 //-----------------------------------------------------------
154 //-----------------------------------------------------------
155 // Shift_JIS variants used by Japanese cell phone carriers.
157 DOCOMO_SHIFT_JIS
= 71,
158 SOFTBANK_SHIFT_JIS
= 72,
159 // ISO-2022-JP variants used by KDDI and SoftBank.
160 KDDI_ISO_2022_JP
= 73,
161 SOFTBANK_ISO_2022_JP
= 74,
162 //-----------------------------------------------------------
164 NUM_ENCODINGS
= 75, // Always keep this at the end. It is not a
165 // valid Encoding enum, it is only used to
166 // indicate the total number of Encodings.
169 #endif // ENCODINGS_PROTO_ENCODINGS_PB_H_