external_libraries/boost/libs/detail/utf8_codecvt_facet.cpp

   1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
   2 // utf8_codecvt_facet.cpp
   3
   4 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
   5 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
   6 // Use, modification and distribution is subject to the Boost Software
   7 // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
   8 // http://www.boost.org/LICENSE_1_0.txt)
   9
  10 // Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to
  11 // learn how this file should be used.
  12
  13 #include <boost/detail/utf8_codecvt_facet.hpp>
  14
  15 #include <cstdlib> // for multi-byte conversion routines
  16 #include <cassert>
  17
  18 #include <boost/limits.hpp>
  19 #include <boost/config.hpp>
  20
  21 // If we don't have wstring, then Unicode support
  22 // is not available anyway, so we don't even need to
  23 // compile this file. This also fixes the problem
  24 // with mingw, which can compile this file, but will
  25 // generate a link error when building a DLL.
  26 #ifndef BOOST_NO_STD_WSTRING
  27
  28 BOOST_UTF8_BEGIN_NAMESPACE
  29
  30 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
  31 // implementation for wchar_t
  32
  33 // Translate incoming UTF-8 into UCS-4
  34 std::codecvt_base::result utf8_codecvt_facet::do_in(
  35     std::mbstate_t& /*state*/,
  36     const char * from,
  37     const char * from_end,
  38     const char * & from_next,
  39     wchar_t * to,
  40     wchar_t * to_end,
  41     wchar_t * & to_next
  42 ) const {
  43     // Basic algorithm:  The first octet determines how many
  44     // octets total make up the UCS-4 character.  The remaining
  45     // "continuing octets" all begin with "10". To convert, subtract
  46     // the amount that specifies the number of octets from the first
  47     // octet.  Subtract 0x80 (1000 0000) from each continuing octet,
  48     // then mash the whole lot together.  Note that each continuing
  49     // octet only uses 6 bits as unique values, so only shift by
  50     // multiples of 6 to combine.
  51     while (from != from_end && to != to_end) {
  52
  53         // Error checking   on the first octet
  54         if (invalid_leading_octet(*from)){
  55             from_next = from;
  56             to_next = to;
  57             return std::codecvt_base::error;
  58         }
  59
  60         // The first octet is   adjusted by a value dependent upon
  61         // the number   of "continuing octets" encoding the character
  62         const   int cont_octet_count = get_cont_octet_count(*from);
  63         const   wchar_t octet1_modifier_table[] =   {
  64             0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
  65         };
  66
  67         // The unsigned char conversion is necessary in case char is
  68         // signed   (I learned this the hard way)
  69         wchar_t ucs_result =
  70             (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
  71
  72         // Invariants   :
  73         //   1) At the start of the loop,   'i' continuing characters have been
  74         //    processed
  75         //   2) *from   points to the next continuing character to be processed.
  76         int i   = 0;
  77         while(i != cont_octet_count && from != from_end) {
  78
  79             // Error checking on continuing characters
  80             if (invalid_continuing_octet(*from)) {
  81                 from_next   = from;
  82                 to_next =   to;
  83                 return std::codecvt_base::error;
  84             }
  85
  86             ucs_result *= (1 << 6);
  87
  88             // each continuing character has an extra (10xxxxxx)b attached to
  89             // it that must be removed.
  90             ucs_result += (unsigned char)(*from++) - 0x80;
  91             ++i;
  92         }
  93
  94         // If   the buffer ends with an incomplete unicode character...
  95         if (from == from_end && i   != cont_octet_count) {
  96             // rewind "from" to before the current character translation
  97             from_next = from - (i+1);
  98             to_next = to;
  99             return std::codecvt_base::partial;
 100         }
 101         *to++   = ucs_result;
 102     }
 103     from_next = from;
 104     to_next = to;
 105
 106     // Were we done converting or did we run out of destination space?
 107     if(from == from_end) return std::codecvt_base::ok;
 108     else return std::codecvt_base::partial;
 109 }
 110
 111 std::codecvt_base::result utf8_codecvt_facet::do_out(
 112     std::mbstate_t& /*state*/,
 113     const wchar_t *   from,
 114     const wchar_t * from_end,
 115     const wchar_t * & from_next,
 116     char * to,
 117     char * to_end,
 118     char * & to_next
 119 ) const
 120 {
 121     // RG - consider merging this table with the other one
 122     const wchar_t octet1_modifier_table[] = {
 123         0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
 124     };
 125
 126     wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
 127     while (from != from_end && to != to_end) {
 128
 129         // Check for invalid UCS-4 character
 130         if (*from  > max_wchar) {
 131             from_next = from;
 132             to_next = to;
 133             return std::codecvt_base::error;
 134         }
 135
 136         int cont_octet_count = get_cont_octet_out_count(*from);
 137
 138         // RG  - comment this formula better
 139         int shift_exponent = (cont_octet_count) *   6;
 140
 141         // Process the first character
 142         *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
 143             (unsigned char)(*from / (1 << shift_exponent)));
 144
 145         // Process the continuation characters
 146         // Invariants: At   the start of the loop:
 147         //   1) 'i' continuing octets   have been generated
 148         //   2) '*to'   points to the next location to place an octet
 149         //   3) shift_exponent is   6 more than needed for the next octet
 150         int i   = 0;
 151         while   (i != cont_octet_count && to != to_end) {
 152             shift_exponent -= 6;
 153             *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
 154             ++i;
 155         }
 156         // If   we filled up the out buffer before encoding the character
 157         if(to   == to_end && i != cont_octet_count) {
 158             from_next = from;
 159             to_next = to - (i+1);
 160             return std::codecvt_base::partial;
 161         }
 162         ++from;
 163     }
 164     from_next = from;
 165     to_next = to;
 166     // Were we done or did we run out of destination space
 167     if(from == from_end) return std::codecvt_base::ok;
 168     else return std::codecvt_base::partial;
 169 }
 170
 171 // How many char objects can I process to get <= max_limit
 172 // wchar_t objects?
 173 int utf8_codecvt_facet::do_length(
 174     BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &,
 175     const char * from,
 176     const char * from_end,
 177     std::size_t max_limit
 178 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
 179 ) const throw()
 180 #else
 181 ) const
 182 #endif
 183 {
 184     // RG - this code is confusing!  I need a better way to express it.
 185     // and test cases.
 186
 187     // Invariants:
 188     // 1) last_octet_count has the size of the last measured character
 189     // 2) char_count holds the number of characters shown to fit
 190     // within the bounds so far (no greater than max_limit)
 191     // 3) from_next points to the octet 'last_octet_count' before the
 192     // last measured character.
 193     int last_octet_count=0;
 194     std::size_t char_count = 0;
 195     const char* from_next = from;
 196     // Use "<" because the buffer may represent incomplete characters
 197     while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
 198         from_next += last_octet_count;
 199         last_octet_count = (get_octet_count(*from_next));
 200         ++char_count;
 201     }
 202     return static_cast<int>(from_next-from_end);
 203 }
 204
 205 unsigned int utf8_codecvt_facet::get_octet_count(
 206     unsigned char   lead_octet
 207 ){
 208     // if the 0-bit (MSB) is 0, then 1 character
 209     if (lead_octet <= 0x7f) return 1;
 210
 211     // Otherwise the count number of consecutive 1 bits starting at MSB
 212 //    assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
 213
 214     if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
 215     else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
 216     else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
 217     else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
 218     else return 6;
 219 }
 220 BOOST_UTF8_END_NAMESPACE
 221
 222 namespace {
 223 template<std::size_t s>
 224 int get_cont_octet_out_count_impl(wchar_t word){
 225     if (word < 0x80) {
 226         return 0;
 227     }
 228     if (word < 0x800) {
 229         return 1;
 230     }
 231     return 2;
 232 }
 233
 234 template<>
 235 int get_cont_octet_out_count_impl<4>(wchar_t word){
 236     if (word < 0x80) {
 237         return 0;
 238     }
 239     if (word < 0x800) {
 240         return 1;
 241     }
 242
 243     // Note that the following code will generate warnings on some platforms
 244     // where wchar_t is defined as UCS2.  The warnings are superfluous as the
 245     // specialization is never instantitiated with such compilers, but this
 246     // can cause problems if warnings are being treated as errors, so we guard
 247     // against that.  Including <boost/detail/utf8_codecvt_facet.hpp> as we do
 248     // should be enough to get WCHAR_MAX defined.
 249 #if !defined(WCHAR_MAX)
 250 #   error WCHAR_MAX not defined!
 251 #endif
 252     // cope with VC++ 7.1 or earlier having invalid WCHAR_MAX
 253 #if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier
 254     return 2;
 255 #elif WCHAR_MAX > 0x10000
 256
 257    if (word < 0x10000) {
 258         return 2;
 259     }
 260     if (word < 0x200000) {
 261         return 3;
 262     }
 263     if (word < 0x4000000) {
 264         return 4;
 265     }
 266     return 5;
 267
 268 #else
 269     return 2;
 270 #endif
 271 }
 272
 273 } // namespace anonymous
 274
 275 BOOST_UTF8_BEGIN_NAMESPACE
 276 // How many "continuing octets" will be needed for this word
 277 // ==   total octets - 1.
 278 int utf8_codecvt_facet::get_cont_octet_out_count(
 279     wchar_t word
 280 ) const {
 281     return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
 282 }
 283 BOOST_UTF8_END_NAMESPACE
 284
 285 #endif