Utilities/cmxmlrpc/xmlrpc_utf8.c

   1 /* Copyright (C) 2001 by Eric Kidd. All rights reserved.
   2 **
   3 ** Redistribution and use in source and binary forms, with or without
   4 ** modification, are permitted provided that the following conditions
   5 ** are met:
   6 ** 1. Redistributions of source code must retain the above copyright
   7 **    notice, this list of conditions and the following disclaimer.
   8 ** 2. Redistributions in binary form must reproduce the above copyright
   9 **    notice, this list of conditions and the following disclaimer in the
  10 **    documentation and/or other materials provided with the distribution.
  11 ** 3. The name of the author may not be used to endorse or promote products
  12 **    derived from this software without specific prior written permission.
  13 **
  14 ** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15 ** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16 ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17 ** ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18 ** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19 ** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20 ** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21 ** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22 ** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23 ** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24 ** SUCH DAMAGE. */
  25
  26
  27 /*=========================================================================
  28 **  XML-RPC UTF-8 Utilities
  29 **=========================================================================
  30 **  Routines for validating, encoding and decoding UTF-8 data.  We try to
  31 **  be very, very strict about invalid UTF-8 data.
  32 **
  33 **  All of the code in this file assumes that your machine represents
  34 **  wchar_t as a 16-bit (or wider) character containing UCS-2 data.  If this
  35 **  assumption is incorrect, you may need to replace this file.
  36 **
  37 **  For lots of information on Unicode and UTF-8 decoding, see:
  38 **    http://www.cl.cam.ac.uk/~mgk25/unicode.html
  39 */
  40
  41 #include "xmlrpc_config.h"
  42
  43 #include "xmlrpc.h"
  44
  45 #ifdef HAVE_UNICODE_WCHAR
  46
  47 /*=========================================================================
  48 **  Tables and Constants
  49 **=========================================================================
  50 **  We use a variety of tables and constants to help decode and validate
  51 **  UTF-8 data.
  52 */
  53
  /* Length in bytes of the UTF-8 sequence that starts with the byte used as
  ** the array index.  A zero entry indicates an illegal initial byte (in
  ** this table: 0x80-0xBF, which are continuation bytes, and 0xFE-0xFF).
  ** This table was generated using a Perl script and information from the
  ** UTF-8 standard.
  **
  ** Fredrik Lundh's UTF-8 decoder in Python 2.0 uses a similar table.  But
  ** since Python 2.0 has the icky CNRI license, I regenerated this
  ** table from scratch and wrote my own decoder.
  **
  ** The table is only ever read, so it is declared const. */
  static const unsigned char utf8_seq_length[256] = {
      /* 0x00-0x7F: single-byte (ASCII) characters */
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x80-0xBF: continuation bytes, never legal as an initial byte */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0xC0-0xDF: two-byte sequences */
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      /* 0xE0-0xEF: three-byte sequences */
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      /* 0xF0-0xFD: four-, five- and six-byte sequences; 0xFE-0xFF: illegal */
      4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
  };
  80
  /* The minimum legal character value for a UTF-8 sequence of the given
  ** length (the length is the array index).  We have to check this to avoid
  ** accepting "overlong" UTF-8 sequences, which use more bytes than
  ** necessary to encode a given character.  Such sequences are commonly
  ** used by evil people to bypass filters and security checks.  This table
  ** is based on the UTF-8-test.txt file by Markus Kuhn <mkuhn@acm.org>.
  **
  ** Only lengths 0 through 3 have entries; the table is only ever read, so
  ** it is declared const. */
  static const wchar_t utf8_min_char_for_length[4] = {
      0,          /* Length 0: Not used (meaningless) */
      0x0000,     /* Length 1: Not used (special-cased) */
      0x0080,     /* Length 2 */
      0x0800      /* Length 3 */

  #if 0
      /* These are only useful on systems where wchar_t is 32-bits wide
      ** and supports full UCS-4.  Enabling them also requires a comma
      ** after the length-3 entry and a larger array bound. */
      0x00010000, /* Length 4 */
      0x00200000, /* Length 5 */
      0x04000000  /* Length 6 */
  #endif
  };
 101
 /* This is the maximum legal 16-bit (UCS-2) character.  Again, this
 ** information is based on UTF-8-test.txt. */
 #define UCS2_MAX_LEGAL_CHARACTER (0xFFFD)

 /* First and last UTF-16 surrogate characters.  These are *not* legal UCS-2
 ** characters--they're used to code for UCS-4 characters when using
 ** UTF-16.  They should never appear in decoded UTF-8 data!  Again, these
 ** could hypothetically be used to bypass security measures on some machines.
 ** Based on UTF-8-test.txt. */
 #define UTF16_FIRST_SURROGATE (0xD800)
 #define UTF16_LAST_SURROGATE  (0xDFFF)

 /* Is the character 'c' a UTF-8 continuation character (bit pattern
 ** 10xxxxxx)?  The argument is converted to unsigned char first so that the
 ** result does not depend on how a negative plain char is represented. */
 #define IS_CONTINUATION(c) ((((unsigned char) (c)) & 0xC0) == 0x80)

 /* Maximum number of bytes needed to encode a supported character. */
 #define MAX_ENCODED_BYTES (3)
 119
 120
 121 /*=========================================================================
 122 **  decode_utf8
 123 **=========================================================================
 124 **  Internal routine which decodes (or validates) a UTF-8 string.
 125 **  To validate, set io_buff and out_buff_len to NULL.  To decode, allocate
 126 **  a sufficiently large buffer, pass it as io_buff, and pass a pointer as
 127 **  as out_buff_len.  The data will be written to the buffer, and the
 128 **  length to out_buff_len.
 129 **
 130 **  We assume that wchar_t holds a single UCS-2 character in native-endian
 131 **  byte ordering.
 132 */
 133
 134 static void
 135 decode_utf8(xmlrpc_env * const env,
 136             const char * const utf8_data,
 137             size_t       const utf8_len,
 138             wchar_t *    const io_buff,
 139             size_t *     const out_buff_len) {
 140
 141     size_t i, length, out_pos;
 142     char init, con1, con2;
 143     wchar_t wc;
 144
 145     XMLRPC_ASSERT_ENV_OK(env);
 146     XMLRPC_ASSERT_PTR_OK(utf8_data);
 147     XMLRPC_ASSERT((!io_buff && !out_buff_len) ||
 148                   (io_buff && out_buff_len));
 149
 150     /* Suppress GCC warning about possibly undefined variable. */
 151     wc = 0;
 152
 153     i = 0;
 154     out_pos = 0;
 155     while (i < utf8_len) {
 156         init = utf8_data[i];
 157         if ((init & 0x80) == 0x00) {
 158             /* Convert ASCII character to wide character. */
 159             wc = init;
 160             i++;
 161         } else {
 162             /* Look up the length of this UTF-8 sequence. */
 163             length = utf8_seq_length[(unsigned char) init];
 164
 165             /* Check to make sure we have enough bytes to convert. */
 166             if (i + length > utf8_len)
 167                 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
 168                             "Truncated UTF-8 sequence");
 169
 170             /* Decode a multibyte UTF-8 sequence. */
 171             switch (length) {
 172             case 0:
 173                 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
 174                             "Invalid UTF-8 initial byte");
 175
 176             case 2:
 177                 /* 110xxxxx 10xxxxxx */
 178                 con1 = utf8_data[i+1];
 179                 if (!IS_CONTINUATION(con1))
 180                     XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
 181                                 "UTF-8 sequence too short");
 182                 wc = ((((wchar_t) (init & 0x1F)) <<  6) |
 183                       (((wchar_t) (con1 & 0x3F))));
 184                 break;
 185
 186             case 3:
 187                 /* 1110xxxx 10xxxxxx 10xxxxxx */
 188                 con1 = utf8_data[i+1];
 189                 con2 = utf8_data[i+2];
 190                 if (!IS_CONTINUATION(con1) || !IS_CONTINUATION(con2))
 191                     XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
 192                                 "UTF-8 sequence too short");
 193                 wc = ((((wchar_t) (init & 0x0F)) << 12) |
 194                       (((wchar_t) (con1 & 0x3F)) <<  6) |
 195                       (((wchar_t) (con2 & 0x3F))));
 196                 break;
 197
 198             case 4:
 199                 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
 200             case 5:
 201                 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
 202             case 6:
 203                 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
 204                 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
 205                             "UCS-4 characters not supported");
 206
 207             default:
 208                 XMLRPC_ASSERT("Error in UTF-8 decoder tables");
 209             }
 210
 211             /* Advance to the end of the sequence. */
 212             i += length;
 213
 214             /* Check for illegal UCS-2 characters. */
 215             if (wc > UCS2_MAX_LEGAL_CHARACTER)
 216                 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
 217                             "UCS-2 characters > U+FFFD are illegal");
 218
 219             /* Check for UTF-16 surrogates. */
 220             if (UTF16_FIRST_SURROGATE <= wc && wc <= UTF16_LAST_SURROGATE)
 221                 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
 222                             "UTF-16 surrogates may not appear in UTF-8 data");
 223
 224             /* Check for overlong sequences. */
 225             if (wc < utf8_min_char_for_length[length])
 226                 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
 227                             "Overlong UTF-8 sequence not allowed");
 228         }
 229
 230         /* If we have a buffer, write our character to it. */
 231         if (io_buff) {
 232             io_buff[out_pos++] = wc;
 233         }
 234     }
 235
 236     /* Record the number of characters we found. */
 237     if (out_buff_len)
 238         *out_buff_len = out_pos;
 239
 240  cleanup:
 241     if (env->fault_occurred) {
 242         if (out_buff_len)
 243             *out_buff_len = 0;
 244     }
 245 }
 246
 247
 248
 249 /*=========================================================================
 250 **  xmlrpc_validate_utf8
 251 **=========================================================================
 252 **  Make sure that a UTF-8 string is valid.
 253 */
 254
 255 void
 256 xmlrpc_validate_utf8 (xmlrpc_env * const env,
 257                       const char * const utf8_data,
 258                       size_t       const utf8_len) {
 259
 260     decode_utf8(env, utf8_data, utf8_len, NULL, NULL);
 261 }
 262
 263
 264 /*=========================================================================
 265 **  xmlrpc_utf8_to_wcs
 266 **=========================================================================
 267 **  Decode UTF-8 string to a "wide character string".  This function
 268 **  returns an xmlrpc_mem_block with an element type of wchar_t.  Don't
 269 **  try to intepret the block in a bytewise fashion--it won't work in
 270 **  any useful or portable fashion.
 271 */
 272
 273 xmlrpc_mem_block *xmlrpc_utf8_to_wcs (xmlrpc_env *env,
 274                                       char *utf8_data,
 275                                       size_t utf8_len)
 276 {
 277     xmlrpc_mem_block *output;
 278     size_t wcs_length;
 279
 280     /* Allocate a memory block large enough to hold any possible output.
 281     ** We assume that each byte of the input may decode to a whcar_t. */
 282     output = XMLRPC_TYPED_MEM_BLOCK_NEW(wchar_t, env, utf8_len);
 283     XMLRPC_FAIL_IF_FAULT(env);
 284
 285     /* Decode the UTF-8 data. */
 286     decode_utf8(env, utf8_data, utf8_len,
 287                 XMLRPC_TYPED_MEM_BLOCK_CONTENTS(wchar_t, output),
 288                 &wcs_length);
 289     XMLRPC_FAIL_IF_FAULT(env);
 290
 291     /* Make sure we didn't overrun our buffer. */
 292     XMLRPC_ASSERT(wcs_length <= utf8_len);
 293
 294     /* Correct the length of the memory block. */
 295     XMLRPC_TYPED_MEM_BLOCK_RESIZE(wchar_t, env, output, wcs_length);
 296     XMLRPC_FAIL_IF_FAULT(env);
 297
 298  cleanup:
 299     if (env->fault_occurred) {
 300         if (output)
 301             xmlrpc_mem_block_free(output);
 302         return NULL;
 303     }
 304     return output;
 305 }
 306
 307
 308 /*=========================================================================
 309 **  xmlrpc_utf8_to_wcs
 310 **=========================================================================
 311 **  Encode a "wide character string" as UTF-8.
 312 */
 313
 314 xmlrpc_mem_block *xmlrpc_wcs_to_utf8 (xmlrpc_env *env,
 315                                       wchar_t *wcs_data,
 316                                       size_t wcs_len)
 317 {
 318     size_t estimate, bytes_used, i;
 319     xmlrpc_mem_block *output;
 320     unsigned char *buffer;
 321     wchar_t wc;
 322     int cwc;
 323
 324     XMLRPC_ASSERT_ENV_OK(env);
 325     XMLRPC_ASSERT_PTR_OK(wcs_data);
 326
 327     /* Allocate a memory block large enough to hold any possible output.
 328     ** We assume that every wchar might encode to the maximum length. */
 329     estimate = wcs_len * MAX_ENCODED_BYTES;
 330     output = XMLRPC_TYPED_MEM_BLOCK_NEW(char, env, estimate);
 331     XMLRPC_FAIL_IF_FAULT(env);
 332
 333     /* Output our characters. */
 334     buffer = (unsigned char*) XMLRPC_TYPED_MEM_BLOCK_CONTENTS(char, output);
 335     bytes_used = 0;
 336     for (i = 0; i < wcs_len; i++) {
 337         wc = wcs_data[i];
 338         cwc = wc;
 339         if (cwc <= 0x007F) {
 340             buffer[bytes_used++] = wc & 0x7F;
 341         } else if (cwc <= 0x07FF) {
 342             /* 110xxxxx 10xxxxxx */
 343             buffer[bytes_used++] = 0xC0 | (wc >> 6);
 344             buffer[bytes_used++] = 0x80 | (wc & 0x3F);
 345         } else if (cwc <= 0xFFFF) {
 346             /* 1110xxxx 10xxxxxx 10xxxxxx */
 347             buffer[bytes_used++] = 0xE0 | (wc >> 12);
 348             buffer[bytes_used++] = 0x80 | ((wc >> 6) & 0x3F);
 349             buffer[bytes_used++] = 0x80 | (wc & 0x3F);
 350         } else {
 351             XMLRPC_FAIL(env, XMLRPC_INTERNAL_ERROR,
 352                         "Don't know how to encode UCS-4 characters yet");
 353         }
 354     }
 355
 356     /* Make sure we didn't overrun our buffer. */
 357     XMLRPC_ASSERT(bytes_used <= estimate);
 358
 359     /* Correct the length of the memory block. */
 360     XMLRPC_TYPED_MEM_BLOCK_RESIZE(char, env, output, bytes_used);
 361     XMLRPC_FAIL_IF_FAULT(env);
 362
 363  cleanup:
 364     if (env->fault_occurred) {
 365         if (output)
 366             xmlrpc_mem_block_free(output);
 367         return NULL;
 368     }
 369     return output;
 370 }
 371
 372 #endif /* HAVE_UNICODE_WCHAR */