Revert of ui: Clean up damaged rects and clear them after painting. (patchset #2...
[chromium-blink-merge.git] / third_party / cld / encodings / public / encodings.h
blob3341f00ad24c9f8bb0c5703ec5118ac094f85b0f
1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef ENCODINGS_PUBLIC_ENCODINGS_H_
6 #define ENCODINGS_PUBLIC_ENCODINGS_H_
8 // This interface defines the Encoding enum and various functions that
9 // depend only on Encoding values.
11 // A hash-function for Encoding, hash<Encoding>, is defined in
12 // i18n/encodings/public/encodings-hash.h
14 // On some Windows projects, UNICODE may be defined, which would prevent the
15 // Encoding enum below from compiling. Note that this is a quick fix that does
16 // not break any existing projects. The UNICODE enum may someday be changed
17 // to something more specific and non-colliding, but this involves careful
18 // testing of changes in many other projects.
19 #undef UNICODE
21 // NOTE: The Encoding enum must always start at 0. This assumption has
22 // been made and used.
24 #ifndef SWIG
26 #include "encodings/proto/encodings.pb.h"
28 // We must have this for compatibility.
29 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
30 //using namespace i18n::encodings;
32 #else
34 // Special proto SWIG workaround header file.
35 #include "i18n/encodings/internal/encodings_proto_wrapper.h"
37 #endif
39 const int kNumEncodings = NUM_ENCODINGS; // Alias for NUM_ENCODINGS; presumably the number of Encoding values -- confirm in the encodings proto header.
41 // Some popular encoding aliases.
42 // TODO(jrm): Make these static const Encoding values instead of macros.
43 #define LATIN1 ISO_8859_1
44 #define LATIN2 ISO_8859_2
45 #define LATIN3 ISO_8859_3
46 #define LATIN4 ISO_8859_4
47 #define CYRILLIC ISO_8859_5
48 #define ARABIC_ENCODING ISO_8859_6 // the _ENCODING suffix avoids using the same name as the language
49 #define GREEK_ENCODING ISO_8859_7 // the _ENCODING suffix avoids using the same name as the language
50 #define HEBREW_ENCODING ISO_8859_8 // the _ENCODING suffix avoids using the same name as the language
51 #define LATIN5 ISO_8859_9
52 #define LATIN6 ISO_8859_10
53 #define KOREAN_HANGUL KOREAN_EUC_KR
55 // Returns the default Encoding (LATIN1).
56 Encoding default_encoding();
60 // *************************************************************
61 // Encoding predicates
62 // IsValidEncoding()
63 // IsEncEncCompatible
64 // IsSupersetOfAscii7Bit
65 // Is8BitEncoding
66 // IsCJKEncoding
67 // IsHebrewEncoding
68 // IsRightToLeftEncoding
69 // IsLogicalRightToLeftEncoding
70 // IsVisualRightToLeftEncoding
71 // IsIso2022Encoding
72 // IsIso2022JpOrVariant
73 // IsShiftJisOrVariant
74 // IsJapaneseCellPhoneCarrierSpecificEncoding
75 // *************************************************************
77 // IsValidEncoding
78 // ---------------
80 // Checks whether the input encoding enum value is within range.
83 bool IsValidEncoding(Encoding enc);
86 // IsEncEncCompatible
87 // ------------------
89 // Determines whether or not converting from the first encoding (from)
90 // to the second (to) requires any changes to the underlying text
91 // (e.g. ASCII_7BIT is a subset of UTF8). NOTE(review): presumably returns true when no changes are required -- confirm against the implementation.
93 // TODO(someone more familiar with i18n): The current implementation
94 // is likely incomplete. It would be good to consider the full matrix
95 // of all pairs of encodings and to fish out all compatible pairs.
97 bool IsEncEncCompatible(const Encoding from, const Encoding to);
99 // To be a superset of 7-bit ASCII means that bytes 0...127 in the given
100 // encoding represent the same characters as they do in ISO_8859_1.
102 // WARNING: This function does not currently return true for all encodings that
103 // are supersets of 7-bit ASCII.
104 bool IsSupersetOfAscii7Bit(Encoding e);
106 // To be an 8-bit encoding means that there are no more than 256 symbols.
107 // Each byte determines a new character; there are no multi-byte sequences.
109 // WARNING: This function does not currently return true for all encodings that
110 // are 8-bit encodings.
111 bool Is8BitEncoding(Encoding e);
113 // IsCJKEncoding
114 // -------------
116 // Returns true if the encoding is Chinese (simplified or
117 // traditional), Japanese, or Korean. Note: UTF8 is not
118 // considered a CJK encoding.
119 bool IsCJKEncoding(Encoding e);
121 // IsHebrewEncoding
122 // ----------------
124 // Returns true if the encoding is a Hebrew-specific
125 // encoding (not UTF8, etc.).
126 bool IsHebrewEncoding(Encoding e);
128 // IsRightToLeftEncoding
129 // ---------------------
131 // Returns true if the encoding is a right-to-left encoding.
133 // Note that the name of this function is somewhat misleading. There is nothing
134 // "right to left" about these encodings. They merely contain code points for
135 // characters in RTL languages such as Hebrew and Arabic. But this is also
136 // true for UTF-8.
138 // TODO(benjy): Get rid of this function. The only special case we
139 // should need to worry about is visual encodings. Anything we
140 // need to do for all 'RTL' encodings we need to do for UTF-8 as well.
141 bool IsRightToLeftEncoding(Encoding enc);
143 // IsLogicalRightToLeftEncoding
144 // ----------------------------
146 // Returns true if the encoding is a logical right-to-left encoding.
147 // Logical right-to-left encodings are those that the browser renders
148 // right-to-left and applies the BiDi algorithm to. Therefore the characters
149 // appear in reading order in the file, and indexing, snippet generation, etc.
150 // should all just work with no special processing.
152 // TODO(benjy): Get rid of this function. The only special case we
153 // should need to worry about is visual encodings.
154 bool IsLogicalRightToLeftEncoding(Encoding enc);
156 // IsVisualRightToLeftEncoding
157 // ---------------------------
159 // Returns true if the encoding is a visual right-to-left encoding.
160 // Visual right-to-left encodings are those that the browser renders
161 // left-to-right and does not apply the BiDi algorithm to. Therefore each
162 // line appears in reverse order in the file, lines are manually wrapped
163 // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
164 // the prehistoric days when browsers couldn't render right-to-left text, but
165 // unfortunately some visual pages persist to this day. These documents require
166 // special processing so that we don't index or snippet them with each line
167 // reversed.
168 bool IsVisualRightToLeftEncoding(Encoding enc);
170 // IsIso2022Encoding
171 // -----------------
173 // Returns true if the encoding is a kind of ISO 2022 encoding, such as
174 // ISO-2022-JP.
175 bool IsIso2022Encoding(Encoding enc);
177 // IsIso2022JpOrVariant
178 // --------------------
180 // Returns true if the encoding is ISO-2022-JP or a variant of it, such as
181 // KDDI's ISO-2022-JP.
182 bool IsIso2022JpOrVariant(Encoding enc);
184 // IsShiftJisOrVariant
185 // -------------------
187 // Returns true if the encoding is Shift_JIS or a variant of it, such as
188 // KDDI's Shift_JIS.
189 bool IsShiftJisOrVariant(Encoding enc);
191 // IsJapaneseCellPhoneCarrierSpecificEncoding
192 // ------------------------------------------
194 // Returns true if the encoding is specific to a Japanese cell phone
195 // carrier, such as KDDI_SHIFT_JIS.
196 bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
200 // *************************************************************
201 // ENCODING NAMES
203 // This interface defines a standard name for each valid encoding, and
204 // a standard name for invalid encodings. (Some names use all upper
205 // case, but others use mixed case.)
207 // EncodingName() [Encoding to name]
208 // MimeEncodingName() [Encoding to name]
209 // EncodingFromName() [name to Encoding]
210 // EncodingNameAliasToEncoding() [name to Encoding]
211 // default_encoding_name()
212 // invalid_encoding_name()
213 // *************************************************************
215 // EncodingName
216 // ------------
218 // Given the encoding, returns its standard name.
219 // Returns invalid_encoding_name() if the encoding is invalid.
221 const char* EncodingName(Encoding enc);
224 // MimeEncodingName
225 // ----------------
227 // Returns the "preferred MIME name" of an encoding.
229 // This name is suitable for use in HTTP headers, HTML tags,
230 // and as the "charset" parameter of a MIME Content-Type.
231 const char* MimeEncodingName(Encoding enc);
234 // The maximum length of an encoding name. NOTE(review): unclear whether this counts the terminating NUL -- confirm.
235 const int kMaxEncodingNameSize = 50;
237 // Returns the standard name of the default encoding.
238 const char* default_encoding_name();
240 // Returns the name used for an invalid encoding.
241 const char* invalid_encoding_name();
243 // EncodingFromName
244 // ----------------
246 // If enc_name matches the standard name of an Encoding, using a
247 // case-insensitive comparison, sets *encoding to that Encoding and
248 // returns true. Otherwise sets *encoding to UNKNOWN_ENCODING and
249 // returns false.
251 // REQUIRES: encoding must not be NULL.
253 bool EncodingFromName(const char* enc_name, Encoding *encoding);
256 // EncodingNameAliasToEncoding
257 // ---------------------------
259 // If enc_name matches the standard name or an alias of an Encoding,
260 // using a case-insensitive comparison, returns that
261 // Encoding. Otherwise, returns UNKNOWN_ENCODING.
263 // Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
264 // ISO_8859_7), alternate names (e.g., "cyrillic" for ISO_8859_5) and
265 // common variations with hyphens and underscores (e.g., "koi8-u" and
266 // "koi8u" for RUSSIAN_KOI8_R).
268 Encoding EncodingNameAliasToEncoding(const char *enc_name);
271 // *************************************************************
272 // Miscellany
273 // *************************************************************
275 // PreferredWebOutputEncoding
276 // --------------------------
278 // Some multi-byte encodings use byte values that coincide with the
279 // ASCII codes for the HTML syntax characters <>"&', and browsers like MSIE
280 // can misinterpret these, as indicated in an external XSS report from
281 // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
282 // also use UTF8 instead of encodings that we don't support in our
283 // output, and we generally try to be conservative in what we send out.
284 // Where the client asks for single- or double-byte encodings that are
285 // not as common, we substitute a more common single- or double-byte
286 // encoding, if there is one, thereby preserving the client's intent
287 // to use less space than UTF-8. This also means that characters
288 // outside the destination set will be converted to HTML NCRs (&#NNN;)
289 // if requested.
290 Encoding PreferredWebOutputEncoding(Encoding enc);
293 // InitEncodings
294 // -------------
296 // Ensures the encodings module has been initialized. Normally this happens
297 // during InitGoogle, but this function allows access for scripts that don't
298 // support InitGoogle.
299 void InitEncodings();
301 #endif // ENCODINGS_PUBLIC_ENCODINGS_H_