ENH: make this work for older versions of OSX
[cmake.git] / Utilities / cmxmlrpc / xmlrpc_utf8.c
blob22223efffd3dc2f7978327e46def0cf5fa7a0ef6
1 /* Copyright (C) 2001 by Eric Kidd. All rights reserved.
2 **
3 ** Redistribution and use in source and binary forms, with or without
4 ** modification, are permitted provided that the following conditions
5 ** are met:
6 ** 1. Redistributions of source code must retain the above copyright
7 ** notice, this list of conditions and the following disclaimer.
8 ** 2. Redistributions in binary form must reproduce the above copyright
9 ** notice, this list of conditions and the following disclaimer in the
10 ** documentation and/or other materials provided with the distribution.
11 ** 3. The name of the author may not be used to endorse or promote products
12 ** derived from this software without specific prior written permission.
13 **
14 ** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 ** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 ** ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 ** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 ** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 ** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 ** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 ** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 ** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 ** SUCH DAMAGE. */
/*=========================================================================
**  XML-RPC UTF-8 Utilities
**=========================================================================
**  Routines for validating, encoding and decoding UTF-8 data.  We try to
**  be very, very strict about invalid UTF-8 data.
**
**  All of the code in this file assumes that your machine represents
**  wchar_t as a 16-bit (or wider) character containing UCS-2 data.  If this
**  assumption is incorrect, you may need to replace this file.
**
**  For lots of information on Unicode and UTF-8 decoding, see:
**    http://www.cl.cam.ac.uk/~mgk25/unicode.html
*/
41 #include "xmlrpc_config.h"
43 #include "xmlrpc.h"
45 #ifdef HAVE_UNICODE_WCHAR
/*=========================================================================
**  Tables and Constants
**=========================================================================
**  We use a variety of tables and constants to help decode and validate
**  UTF-8 data.
*/
/* The number of bytes in a UTF-8 sequence starting with the character used
** as the array index.  A zero entry indicates an illegal initial byte
** (the continuation bytes 0x80-0xBF, plus 0xFE and 0xFF).
** This table was generated using a Perl script and information from the
** UTF-8 standard.
**
** Note that 0xC0 and 0xC1 are listed as 2-byte lead bytes even though they
** can only start overlong sequences; the decoder rejects those with its
** separate minimum-value check.
**
** Fredrik Lundh's UTF-8 decoder Python 2.0 uses a similar table.  But
** since Python 2.0 has the icky CNRI license, I regenerated this
** table from scratch and wrote my own decoder. */
static const unsigned char utf8_seq_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, never legal as an initial byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four- to six-byte sequences; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
/* The minimum legal character value for a UTF-8 sequence of the given
** length.  We have to check this to avoid accepting "overlong" UTF-8
** sequences, which use more bytes than necessary to encode a given
** character.  Such sequences are commonly used by evil people to bypass
** filters and security checks.  This table is based on the UTF-8-test.txt
** file by Markus Kuhn <mkuhn@acm.org>.
**
** Only lengths up to 3 are listed, because longer sequences are rejected
** before this table is consulted.  On a system where wchar_t is 32 bits
** wide and supports full UCS-4, the additional entries would be:
**     0x00010000   (Length 4)
**     0x00200000   (Length 5)
**     0x04000000   (Length 6)
*/
static const wchar_t utf8_min_char_for_length[4] = {
    0,          /* Length 0: Not used (meaningless) */
    0x0000,     /* Length 1: Not used (special-cased) */
    0x0080,     /* Length 2 */
    0x0800      /* Length 3 */
};
/* This is the maximum legal 16-bit (UCS-2) character.  Again, this
** information is based on UTF-8-test.txt. */
#define UCS2_MAX_LEGAL_CHARACTER (0xFFFD)

/* First and last UTF-16 surrogate characters.  These are *not* legal UCS-2
** characters--they're used to code for UCS-4 characters when using
** UTF-16.  They should never appear in decoded UTF-8 data!  Again, these
** could hypothetically be used to bypass security measures on some machines.
** Based on UTF-8-test.txt. */
#define UTF16_FIRST_SURROGATE (0xD800)
#define UTF16_LAST_SURROGATE  (0xDFFF)

/* Is the character 'c' a UTF-8 continuation character?  (Continuation
** bytes have the bit pattern 10xxxxxx.) */
#define IS_CONTINUATION(c) (((c) & 0xC0) == 0x80)

/* Maximum number of bytes needed to encode a supported character. */
#define MAX_ENCODED_BYTES (3)
121 /*=========================================================================
122 ** decode_utf8
123 **=========================================================================
124 ** Internal routine which decodes (or validates) a UTF-8 string.
125 ** To validate, set io_buff and out_buff_len to NULL. To decode, allocate
126 ** a sufficiently large buffer, pass it as io_buff, and pass a pointer as
127 ** as out_buff_len. The data will be written to the buffer, and the
128 ** length to out_buff_len.
130 ** We assume that wchar_t holds a single UCS-2 character in native-endian
131 ** byte ordering.
134 static void
135 decode_utf8(xmlrpc_env * const env,
136 const char * const utf8_data,
137 size_t const utf8_len,
138 wchar_t * const io_buff,
139 size_t * const out_buff_len) {
141 size_t i, length, out_pos;
142 char init, con1, con2;
143 wchar_t wc;
145 XMLRPC_ASSERT_ENV_OK(env);
146 XMLRPC_ASSERT_PTR_OK(utf8_data);
147 XMLRPC_ASSERT((!io_buff && !out_buff_len) ||
148 (io_buff && out_buff_len));
150 /* Suppress GCC warning about possibly undefined variable. */
151 wc = 0;
153 i = 0;
154 out_pos = 0;
155 while (i < utf8_len) {
156 init = utf8_data[i];
157 if ((init & 0x80) == 0x00) {
158 /* Convert ASCII character to wide character. */
159 wc = init;
160 i++;
161 } else {
162 /* Look up the length of this UTF-8 sequence. */
163 length = utf8_seq_length[(unsigned char) init];
165 /* Check to make sure we have enough bytes to convert. */
166 if (i + length > utf8_len)
167 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
168 "Truncated UTF-8 sequence");
170 /* Decode a multibyte UTF-8 sequence. */
171 switch (length) {
172 case 0:
173 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
174 "Invalid UTF-8 initial byte");
176 case 2:
177 /* 110xxxxx 10xxxxxx */
178 con1 = utf8_data[i+1];
179 if (!IS_CONTINUATION(con1))
180 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
181 "UTF-8 sequence too short");
182 wc = ((((wchar_t) (init & 0x1F)) << 6) |
183 (((wchar_t) (con1 & 0x3F))));
184 break;
186 case 3:
187 /* 1110xxxx 10xxxxxx 10xxxxxx */
188 con1 = utf8_data[i+1];
189 con2 = utf8_data[i+2];
190 if (!IS_CONTINUATION(con1) || !IS_CONTINUATION(con2))
191 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
192 "UTF-8 sequence too short");
193 wc = ((((wchar_t) (init & 0x0F)) << 12) |
194 (((wchar_t) (con1 & 0x3F)) << 6) |
195 (((wchar_t) (con2 & 0x3F))));
196 break;
198 case 4:
199 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
200 case 5:
201 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
202 case 6:
203 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
204 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
205 "UCS-4 characters not supported");
207 default:
208 XMLRPC_ASSERT("Error in UTF-8 decoder tables");
211 /* Advance to the end of the sequence. */
212 i += length;
214 /* Check for illegal UCS-2 characters. */
215 if (wc > UCS2_MAX_LEGAL_CHARACTER)
216 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
217 "UCS-2 characters > U+FFFD are illegal");
219 /* Check for UTF-16 surrogates. */
220 if (UTF16_FIRST_SURROGATE <= wc && wc <= UTF16_LAST_SURROGATE)
221 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
222 "UTF-16 surrogates may not appear in UTF-8 data");
224 /* Check for overlong sequences. */
225 if (wc < utf8_min_char_for_length[length])
226 XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
227 "Overlong UTF-8 sequence not allowed");
230 /* If we have a buffer, write our character to it. */
231 if (io_buff) {
232 io_buff[out_pos++] = wc;
236 /* Record the number of characters we found. */
237 if (out_buff_len)
238 *out_buff_len = out_pos;
240 cleanup:
241 if (env->fault_occurred) {
242 if (out_buff_len)
243 *out_buff_len = 0;
249 /*=========================================================================
250 ** xmlrpc_validate_utf8
251 **=========================================================================
252 ** Make sure that a UTF-8 string is valid.
255 void
256 xmlrpc_validate_utf8 (xmlrpc_env * const env,
257 const char * const utf8_data,
258 size_t const utf8_len) {
260 decode_utf8(env, utf8_data, utf8_len, NULL, NULL);
264 /*=========================================================================
265 ** xmlrpc_utf8_to_wcs
266 **=========================================================================
267 ** Decode UTF-8 string to a "wide character string". This function
268 ** returns an xmlrpc_mem_block with an element type of wchar_t. Don't
269 ** try to intepret the block in a bytewise fashion--it won't work in
270 ** any useful or portable fashion.
273 xmlrpc_mem_block *xmlrpc_utf8_to_wcs (xmlrpc_env *env,
274 char *utf8_data,
275 size_t utf8_len)
277 xmlrpc_mem_block *output;
278 size_t wcs_length;
280 /* Allocate a memory block large enough to hold any possible output.
281 ** We assume that each byte of the input may decode to a whcar_t. */
282 output = XMLRPC_TYPED_MEM_BLOCK_NEW(wchar_t, env, utf8_len);
283 XMLRPC_FAIL_IF_FAULT(env);
285 /* Decode the UTF-8 data. */
286 decode_utf8(env, utf8_data, utf8_len,
287 XMLRPC_TYPED_MEM_BLOCK_CONTENTS(wchar_t, output),
288 &wcs_length);
289 XMLRPC_FAIL_IF_FAULT(env);
291 /* Make sure we didn't overrun our buffer. */
292 XMLRPC_ASSERT(wcs_length <= utf8_len);
294 /* Correct the length of the memory block. */
295 XMLRPC_TYPED_MEM_BLOCK_RESIZE(wchar_t, env, output, wcs_length);
296 XMLRPC_FAIL_IF_FAULT(env);
298 cleanup:
299 if (env->fault_occurred) {
300 if (output)
301 xmlrpc_mem_block_free(output);
302 return NULL;
304 return output;
308 /*=========================================================================
309 ** xmlrpc_utf8_to_wcs
310 **=========================================================================
311 ** Encode a "wide character string" as UTF-8.
314 xmlrpc_mem_block *xmlrpc_wcs_to_utf8 (xmlrpc_env *env,
315 wchar_t *wcs_data,
316 size_t wcs_len)
318 size_t estimate, bytes_used, i;
319 xmlrpc_mem_block *output;
320 unsigned char *buffer;
321 wchar_t wc;
322 int cwc;
324 XMLRPC_ASSERT_ENV_OK(env);
325 XMLRPC_ASSERT_PTR_OK(wcs_data);
327 /* Allocate a memory block large enough to hold any possible output.
328 ** We assume that every wchar might encode to the maximum length. */
329 estimate = wcs_len * MAX_ENCODED_BYTES;
330 output = XMLRPC_TYPED_MEM_BLOCK_NEW(char, env, estimate);
331 XMLRPC_FAIL_IF_FAULT(env);
333 /* Output our characters. */
334 buffer = (unsigned char*) XMLRPC_TYPED_MEM_BLOCK_CONTENTS(char, output);
335 bytes_used = 0;
336 for (i = 0; i < wcs_len; i++) {
337 wc = wcs_data[i];
338 cwc = wc;
339 if (cwc <= 0x007F) {
340 buffer[bytes_used++] = wc & 0x7F;
341 } else if (cwc <= 0x07FF) {
342 /* 110xxxxx 10xxxxxx */
343 buffer[bytes_used++] = 0xC0 | (wc >> 6);
344 buffer[bytes_used++] = 0x80 | (wc & 0x3F);
345 } else if (cwc <= 0xFFFF) {
346 /* 1110xxxx 10xxxxxx 10xxxxxx */
347 buffer[bytes_used++] = 0xE0 | (wc >> 12);
348 buffer[bytes_used++] = 0x80 | ((wc >> 6) & 0x3F);
349 buffer[bytes_used++] = 0x80 | (wc & 0x3F);
350 } else {
351 XMLRPC_FAIL(env, XMLRPC_INTERNAL_ERROR,
352 "Don't know how to encode UCS-4 characters yet");
356 /* Make sure we didn't overrun our buffer. */
357 XMLRPC_ASSERT(bytes_used <= estimate);
359 /* Correct the length of the memory block. */
360 XMLRPC_TYPED_MEM_BLOCK_RESIZE(char, env, output, bytes_used);
361 XMLRPC_FAIL_IF_FAULT(env);
363 cleanup:
364 if (env->fault_occurred) {
365 if (output)
366 xmlrpc_mem_block_free(output);
367 return NULL;
369 return output;
372 #endif /* HAVE_UNICODE_WCHAR */