third_party/cld/encodings/public/encodings.h

   1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifndef ENCODINGS_PUBLIC_ENCODINGS_H_
   6 #define ENCODINGS_PUBLIC_ENCODINGS_H_
   7
   8 // This interface defines the Encoding enum and various functions that
   9 // depend only on Encoding values.
  10
  11 // A hash-function for Encoding, hash<Encoding>, is defined in
  12 // i18n/encodings/public/encodings-hash.h
  13
  14 // On some Windows projects, UNICODE may be defined, which would prevent the
  15 // Encoding enum below from compiling. Note that this is a quick fix that does
  16 // not break any existing projects. The UNICODE enum may someday be changed
  17 // to something more specific and non-colliding, but this involves careful
  18 // testing of changes in many other projects.
  19 #undef UNICODE
  20
  21 // NOTE: The Encoding enum must always start at 0. This assumption has
  22 // been made and used.
  23
  24 #ifndef SWIG
  25
  26 #include "encodings/proto/encodings.pb.h"
  27
  28 // We must have this for compatibility.
  29 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
  30 //using namespace i18n::encodings;
  31
  32 #else
  33
  34 // Special proto SWIG workaround header file.
  35 #include "i18n/encodings/internal/encodings_proto_wrapper.h"
  36
  37 #endif
  38
  39 const int kNumEncodings = NUM_ENCODINGS;
  40
  41 // some of the popular encoding aliases
  42 // TODO(jrm) Make these static const Encoding values instead of macros.
  43 #define LATIN1           ISO_8859_1
  44 #define LATIN2           ISO_8859_2
  45 #define LATIN3           ISO_8859_3
  46 #define LATIN4           ISO_8859_4
  47 #define CYRILLIC         ISO_8859_5
  48 #define ARABIC_ENCODING  ISO_8859_6     // avoiding the same name as language
  49 #define GREEK_ENCODING   ISO_8859_7     // avoiding the same name as language
  50 #define HEBREW_ENCODING  ISO_8859_8     // avoiding the same name as language
  51 #define LATIN5           ISO_8859_9
  52 #define LATIN6           ISO_8859_10
  53 #define KOREAN_HANGUL    KOREAN_EUC_KR
  54
  55 // The default Encoding (LATIN1).
  56 Encoding default_encoding();
  57
  58
  59
  60 // *************************************************************
  61 // Encoding predicates
  62 //   IsValidEncoding()
  63 //   IsEncEncCompatible
  64 //   IsSupersetOfAscii7Bit
  65 //   Is8BitEncoding
  66 //   IsCJKEncoding
  67 //   IsHebrewEncoding
  68 //   IsRightToLeftEncoding
  69 //   IsLogicalRightToLeftEncoding
  70 //   IsVisualRightToLeftEncoding
  71 //   IsIso2022Encoding
  72 //   IsIso2022JpOrVariant
  73 //   IsShiftJisOrVariant
  74 //   IsJapaneseCellPhoneCarrierSpecificEncoding
  75 // *************************************************************
  76
  77 // IsValidEncoding
  78 // ---------------
  79 //
  80 // Checks whether the input Encoding enum value is within range.
  81 //
  82
  83 bool IsValidEncoding(Encoding enc);
  84
  85 //
  86 // IsEncEncCompatible
  87 // ------------------
  88 //
  89 // This function is to determine whether or not converting from the
  90 // first encoding to the second requires any changes to the underlying
  91 // text (e.g.  ASCII_7BIT is a subset of UTF8).
  92 //
  93 // TODO(someone more familiar with i18n): the current implementation
  94 // is likely incomplete.  It would be good to consider the full matrix
  95 // of all pairs of encodings and to fish out all compatible pairs.
  96 //
  97 bool IsEncEncCompatible(const Encoding from, const Encoding to);
  98
  99 // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
 100 // encoding represent the same characters as they do in ISO_8859_1.
 101
 102 // WARNING: This function does not currently return true for all encodings that
 103 // are supersets of Ascii 7-bit.
 104 bool IsSupersetOfAscii7Bit(Encoding e);
 105
 106 // To be an 8-bit encoding means that there are at most 256 symbols.
 107 // Each byte determines a new character; there are no multi-byte sequences.
 108
 109 // WARNING: This function does not currently return true for all encodings that
 110 // are 8-bit encodings.
 111 bool Is8BitEncoding(Encoding e);
 112
 113 // IsCJKEncoding
 114 // -------------
 115 //
 116 // This function returns true if the encoding is either Chinese
 117 // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
 118 // considered a CJK encoding.
 119 bool IsCJKEncoding(Encoding e);
 120
 121 // IsHebrewEncoding
 122 // -------------
 123 //
 124 // This function returns true if the encoding is a Hebrew specific
 125 // encoding (not UTF8, etc).
 126 bool IsHebrewEncoding(Encoding e);
 127
 128 // IsRightToLeftEncoding
 129 // ---------------------
 130 //
 131 // Returns true if the encoding is a right-to-left encoding.
 132 //
 133 // Note that the name of this function is somewhat misleading. There is nothing
 134 // "right to left" about these encodings. They merely contain code points for
 135 // characters in RTL languages such as Hebrew and Arabic. But this is also
 136 // true for UTF-8.
 137 //
 138 // TODO(benjy): Get rid of this function. The only special case we
 139 // should need to worry about is visual encodings. Anything we
 140 // need to do for all 'RTL' encodings we need to do for UTF-8 as well.
 141 bool IsRightToLeftEncoding(Encoding enc);
 142
 143 // IsLogicalRightToLeftEncoding
 144 // ----------------------------
 145 //
 146 // Returns true if the encoding is a logical right-to-left encoding.
 147 // Logical right-to-left encodings are those that the browser renders
 148 // right-to-left and applies the BiDi algorithm to. Therefore the characters
 149 // appear in reading order in the file, and indexing, snippet generation etc.
 150 // should all just work with no special processing.
 151 //
 152 // TODO(benjy): Get rid of this function. The only special case we
 153 // should need to worry about is visual encodings.
 154 bool IsLogicalRightToLeftEncoding(Encoding enc);
 155
 156 // IsVisualRightToLeftEncoding
 157 // ---------------------------
 158 //
 159 // Returns true if the encoding is a visual right-to-left encoding.
 160 // Visual right-to-left encodings are those that the browser renders
 161 // left-to-right and does not apply the BiDi algorithm to. Therefore each
 162 // line appears in reverse order in the file, lines are manually wrapped
 163 // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
 164 // the prehistoric days when browsers couldn't render right-to-left, but
 165 // unfortunately some visual pages persist to this day. These documents require
 166 // special processing so that we don't index or snippet them with each line
 167 // reversed.
 168 bool IsVisualRightToLeftEncoding(Encoding enc);
 169
 170 // IsIso2022Encoding
 171 // -----------------
 172 //
 173 // Returns true if the encoding is a kind of ISO 2022 such as
 174 // ISO-2022-JP.
 175 bool IsIso2022Encoding(Encoding enc);
 176
 177 // IsIso2022JpOrVariant
 178 // --------------------
 179 //
 180 // Returns true if the encoding is ISO-2022-JP or a variant such as
 181 // KDDI's ISO-2022-JP.
 182 bool IsIso2022JpOrVariant(Encoding enc);
 183
 184 // IsShiftJisOrVariant
 185 // --------------------
 186 //
 187 // Returns true if the encoding is Shift_JIS or a variant such as
 188 // KDDI's Shift_JIS.
 189 bool IsShiftJisOrVariant(Encoding enc);
 190
 191 // IsJapaneseCellPhoneCarrierSpecificEncoding
 192 // ------------------------------------------
 193 //
 194 // Returns true if it's a Japanese cell phone carrier specific encoding
 195 // such as KDDI_SHIFT_JIS.
 196 bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
 197
 198
 199
 200 // *************************************************************
 201 // ENCODING NAMES
 202 //
 203 // This interface defines a standard name for each valid encoding, and
 204 // a standard name for invalid encodings. (Some names use all upper
 205 // case, but others use mixed case.)
 206 //
 207 //   EncodingName() [Encoding to name]
 208 //   MimeEncodingName() [Encoding to name]
 209 //   EncodingFromName() [name to Encoding]
 210 //   EncodingNameAliasToEncoding() [name to Encoding]
 211 //   default_encoding_name()
 212 //   invalid_encoding_name()
 213 // *************************************************************
 214
 215 // EncodingName
 216 // ------------
 217 //
 218 // Given the encoding, returns its standard name.
 219 // Return invalid_encoding_name() if the encoding is invalid.
 220 //
 221 const char* EncodingName(Encoding enc);
 222
 223 //
 224 // MimeEncodingName
 225 // ----------------
 226 //
 227 // Return the "preferred MIME name" of an encoding.
 228 //
 229 // This name is suitable for using in HTTP headers, HTML tags,
 230 // and as the "charset" parameter of a MIME Content-Type.
 231 const char* MimeEncodingName(Encoding enc);
 232
 233
 234 // The maximum length of an encoding name
 235 const int kMaxEncodingNameSize = 50;
 236
 237 // The standard name of the default encoding.
 238 const char* default_encoding_name();
 239
 240 // The name used for an invalid encoding.
 241 const char* invalid_encoding_name();
 242
 243 // EncodingFromName
 244 // ----------------
 245 //
 246 // If enc_name matches the standard name of an Encoding, using a
 247 // case-insensitive comparison, set *encoding to that Encoding and
 248 // return true.  Otherwise set *encoding to UNKNOWN_ENCODING and
 249 // return false.
 250 //
 251 // REQUIRES: encoding must not be NULL.
 252 //
 253 bool EncodingFromName(const char* enc_name, Encoding *encoding);
 254
 255 //
 256 // EncodingNameAliasToEncoding
 257 // ---------------------------
 258 //
 259 // If enc_name matches the standard name or an alias of an Encoding,
 260 // using a case-insensitive comparison, return that Encoding.
 261 // Otherwise, return UNKNOWN_ENCODING.
 262 //
 263 // Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
 264 // ISO_8859_7), alternate names (e.g., "cyrillic" for ISO_8859_5) and
 265 // common variations with hyphens and underscores (e.g., "koi8-u" and
 266 // "koi8u" for RUSSIAN_KOI8_R).
 267
 268 Encoding EncodingNameAliasToEncoding(const char *enc_name);
 269
 270
 271 // *************************************************************
 272 // Miscellany
 273 // *************************************************************
 274
 275 // PreferredWebOutputEncoding
 276 // --------------------------
 277 //
 278 // Some multi-byte encodings use byte values that coincide with the
 279 // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
 280 // can misinterpret these, as indicated in an external XSS report from
 281 // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
 282 // also use UTF8 instead of encodings that we don't support in our
 283 // output, and we generally try to be conservative in what we send out.
 284 // Where the client asks for single- or double-byte encodings that are
 285 // not as common, we substitute a more common single- or double-byte
 286 // encoding, if there is one, thereby preserving the client's intent
 287 // to use less space than UTF-8. This also means that characters
 288 // outside the destination set will be converted to HTML NCRs (&#NNN;)
 289 // if requested.
 290 Encoding PreferredWebOutputEncoding(Encoding enc);
 291
 292
 293 // InitEncodings
 294 // -------------
 295 //
 296 // Ensures the encodings module has been initialized.  Normally this happens
 297 // during InitGoogle, but this allows access for scripts that don't
 298 // support InitGoogle.
 299 void InitEncodings();
 300
 301 #endif  // ENCODINGS_PUBLIC_ENCODINGS_H_