Win32: fix an incorrect error status being propagated to the caller in case
[svn/apache.git] / subversion / libsvn_subr / utf.c
blob5164f649edbcc0cbd634263aa93464e3d49d32df
1 /*
2 * utf.c: UTF-8 conversion routines
4 * ====================================================================
5 * Licensed to the Apache Software Foundation (ASF) under one
6 * or more contributor license agreements. See the NOTICE file
7 * distributed with this work for additional information
8 * regarding copyright ownership. The ASF licenses this file
9 * to you under the Apache License, Version 2.0 (the
10 * "License"); you may not use this file except in compliance
11 * with the License. You may obtain a copy of the License at
13 * http://www.apache.org/licenses/LICENSE-2.0
15 * Unless required by applicable law or agreed to in writing,
16 * software distributed under the License is distributed on an
17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 * KIND, either express or implied. See the License for the
19 * specific language governing permissions and limitations
20 * under the License.
21 * ====================================================================
26 #include <stdlib.h>
27 #include <string.h>
28 #include <assert.h>
30 #include <apr_strings.h>
31 #include <apr_lib.h>
32 #include <apr_xlate.h>
33 #include <apr_atomic.h>
35 #include "svn_hash.h"
36 #include "svn_string.h"
37 #include "svn_error.h"
38 #include "svn_pools.h"
39 #include "svn_ctype.h"
40 #include "svn_utf.h"
41 #include "svn_private_config.h"
42 #include "win32_xlate.h"
44 #include "private/svn_utf_private.h"
45 #include "private/svn_dep_compat.h"
46 #include "private/svn_string_private.h"
47 #include "private/svn_mutex.h"
51 /* Use these static strings to maximize performance on standard conversions.
52 * Any strings on other locations are still valid, however.
54 static const char *SVN_UTF_NTOU_XLATE_HANDLE = "svn-utf-ntou-xlate-handle";
55 static const char *SVN_UTF_UTON_XLATE_HANDLE = "svn-utf-uton-xlate-handle";
57 static const char *SVN_APR_UTF8_CHARSET = "UTF-8";
59 static svn_mutex__t *xlate_handle_mutex = NULL;
60 static svn_boolean_t assume_native_charset_is_utf8 = FALSE;
62 #if defined(WIN32)
63 typedef svn_subr__win32_xlate_t xlate_handle_t;
64 #else
65 typedef apr_xlate_t xlate_handle_t;
66 #endif
68 /* The xlate handle cache is a global hash table with linked lists of xlate
69 * handles. In multi-threaded environments, a thread "borrows" an xlate
70 * handle from the cache during a translation and puts it back afterwards.
71 * This avoids holding a global lock for all translations.
72 * If there is no handle for a particular key when needed, a new is
73 * handle is created and put in the cache after use.
74 * This means that there will be at most N handles open for a key, where N
75 * is the number of simultanous handles in use for that key. */
77 typedef struct xlate_handle_node_t {
78 xlate_handle_t *handle;
79 /* FALSE if the handle is not valid, since its pool is being
80 destroyed. */
81 svn_boolean_t valid;
82 /* The name of a char encoding or APR_LOCALE_CHARSET. */
83 const char *frompage, *topage;
84 struct xlate_handle_node_t *next;
85 } xlate_handle_node_t;
87 /* This maps const char * userdata_key strings to xlate_handle_node_t **
88 handles to the first entry in the linked list of xlate handles. We don't
89 store the pointer to the list head directly in the hash table, since we
90 remove/insert entries at the head in the list in the code below, and
91 we can't use apr_hash_set() in each character translation because that
92 function allocates memory in each call where the value is non-NULL.
93 Since these allocations take place in a global pool, this would be a
94 memory leak. */
95 static apr_hash_t *xlate_handle_hash = NULL;
97 /* "1st level cache" to standard conversion maps. We may access these
98 * using atomic xchange ops, i.e. without further thread synchronization.
99 * If the respective item is NULL, fallback to hash lookup.
101 static void * volatile xlat_ntou_static_handle = NULL;
102 static void * volatile xlat_uton_static_handle = NULL;
104 /* Clean up the xlate handle cache. */
105 static apr_status_t
106 xlate_cleanup(void *arg)
108 /* We set the cache variables to NULL so that translation works in other
109 cleanup functions, even if it isn't cached then. */
110 xlate_handle_hash = NULL;
112 /* ensure no stale objects get accessed */
113 xlat_ntou_static_handle = NULL;
114 xlat_uton_static_handle = NULL;
116 return APR_SUCCESS;
119 /* Set the handle of ARG to NULL. */
120 static apr_status_t
121 xlate_handle_node_cleanup(void *arg)
123 xlate_handle_node_t *node = arg;
125 node->valid = FALSE;
126 return APR_SUCCESS;
129 void
130 svn_utf_initialize2(svn_boolean_t assume_native_utf8,
131 apr_pool_t *pool)
133 if (!xlate_handle_hash)
135 /* We create our own subpool, which we protect with the mutex.
136 We can't use the pool passed to us by the caller, since we will
137 use it for xlate handle allocations, possibly in multiple threads,
138 and pool allocation is not thread-safe. */
139 apr_pool_t *subpool = svn_pool_create(pool);
140 svn_mutex__t *mutex;
141 svn_error_t *err = svn_mutex__init(&mutex, TRUE, subpool);
142 if (err)
144 svn_error_clear(err);
145 return;
148 xlate_handle_mutex = mutex;
149 xlate_handle_hash = apr_hash_make(subpool);
151 apr_pool_cleanup_register(subpool, NULL, xlate_cleanup,
152 apr_pool_cleanup_null);
155 if (!assume_native_charset_is_utf8)
156 assume_native_charset_is_utf8 = assume_native_utf8;
159 /* Return a unique string key based on TOPAGE and FROMPAGE. TOPAGE and
160 * FROMPAGE can be any valid arguments of the same name to
161 * apr_xlate_open(). Allocate the returned string in POOL. */
162 static const char*
163 get_xlate_key(const char *topage,
164 const char *frompage,
165 apr_pool_t *pool)
167 /* In the cases of SVN_APR_LOCALE_CHARSET and SVN_APR_DEFAULT_CHARSET
168 * topage/frompage is really an int, not a valid string. So generate a
169 * unique key accordingly. */
170 if (frompage == SVN_APR_LOCALE_CHARSET)
171 frompage = "APR_LOCALE_CHARSET";
172 else if (frompage == SVN_APR_DEFAULT_CHARSET)
173 frompage = "APR_DEFAULT_CHARSET";
175 if (topage == SVN_APR_LOCALE_CHARSET)
176 topage = "APR_LOCALE_CHARSET";
177 else if (topage == SVN_APR_DEFAULT_CHARSET)
178 topage = "APR_DEFAULT_CHARSET";
180 return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage,
181 "-xlate-handle", SVN_VA_NULL);
184 /* Atomically replace the content in *MEM with NEW_VALUE and return
185 * the previous content of *MEM. If atomicy cannot be guaranteed,
186 * *MEM will not be modified and NEW_VALUE is simply returned to
187 * the caller.
189 static APR_INLINE void*
190 atomic_swap(void * volatile * mem, void *new_value)
192 #if APR_HAS_THREADS
193 return svn_atomic_xchgptr(mem, new_value);
194 #else
195 /* no threads - no sync. necessary */
196 void *old_value = (void*)*mem;
197 *mem = new_value;
198 return old_value;
199 #endif
202 /* Set *RET to a newly created handle node for converting from FROMPAGE
203 to TOPAGE, If apr_xlate_open() returns APR_EINVAL or APR_ENOTIMPL, set
204 (*RET)->handle to NULL. If fail for any other reason, return the error.
205 Allocate *RET and its xlate handle in POOL. */
206 static svn_error_t *
207 xlate_alloc_handle(xlate_handle_node_t **ret,
208 const char *topage, const char *frompage,
209 apr_pool_t *pool)
211 apr_status_t apr_err;
212 xlate_handle_t *handle;
213 const char *name;
215 /* The error handling doesn't support the following cases, since we don't
216 use them currently. Catch this here. */
217 SVN_ERR_ASSERT(frompage != SVN_APR_DEFAULT_CHARSET
218 && topage != SVN_APR_DEFAULT_CHARSET
219 && (frompage != SVN_APR_LOCALE_CHARSET
220 || topage != SVN_APR_LOCALE_CHARSET));
222 /* Try to create a handle. */
223 #if defined(WIN32)
224 apr_err = svn_subr__win32_xlate_open(&handle, topage,
225 frompage, pool);
226 name = "win32-xlate: ";
227 #else
228 apr_err = apr_xlate_open(&handle, topage, frompage, pool);
229 name = "APR: ";
230 #endif
232 if (APR_STATUS_IS_EINVAL(apr_err) || APR_STATUS_IS_ENOTIMPL(apr_err))
233 handle = NULL;
234 else if (apr_err != APR_SUCCESS)
236 const char *errstr;
237 char apr_strerr[512];
239 /* Can't use svn_error_wrap_apr here because it calls functions in
240 this file, leading to infinite recursion. */
241 if (frompage == SVN_APR_LOCALE_CHARSET)
242 errstr = apr_psprintf(pool,
243 _("Can't create a character converter from "
244 "native encoding to '%s'"), topage);
245 else if (topage == SVN_APR_LOCALE_CHARSET)
246 errstr = apr_psprintf(pool,
247 _("Can't create a character converter from "
248 "'%s' to native encoding"), frompage);
249 else
250 errstr = apr_psprintf(pool,
251 _("Can't create a character converter from "
252 "'%s' to '%s'"), frompage, topage);
254 /* Just put the error on the stack, since svn_error_create duplicates it
255 later. APR_STRERR will be in the local encoding, not in UTF-8, though.
257 svn_strerror(apr_err, apr_strerr, sizeof(apr_strerr));
258 return svn_error_createf(SVN_ERR_PLUGIN_LOAD_FAILURE,
259 svn_error_create(apr_err, NULL, apr_strerr),
260 "%s%s", name, errstr);
263 /* Allocate and initialize the node. */
264 *ret = apr_palloc(pool, sizeof(xlate_handle_node_t));
265 (*ret)->handle = handle;
266 (*ret)->valid = TRUE;
267 (*ret)->frompage = ((frompage != SVN_APR_LOCALE_CHARSET)
268 ? apr_pstrdup(pool, frompage) : frompage);
269 (*ret)->topage = ((topage != SVN_APR_LOCALE_CHARSET)
270 ? apr_pstrdup(pool, topage) : topage);
271 (*ret)->next = NULL;
273 /* If we are called from inside a pool cleanup handler, the just created
274 xlate handle will be closed when that handler returns by a newly
275 registered cleanup handler, however, the handle is still cached by us.
276 To prevent this, we register a cleanup handler that will reset the valid
277 flag of our node, so we don't use an invalid handle. */
278 if (handle)
279 apr_pool_cleanup_register(pool, *ret, xlate_handle_node_cleanup,
280 apr_pool_cleanup_null);
282 return SVN_NO_ERROR;
285 /* Extend xlate_alloc_handle by using USERDATA_KEY as a key in our
286 global hash map, if available.
288 Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
289 hasn't been called or USERDATA_KEY is NULL. Else, allocate them
290 in the pool of xlate_handle_hash.
292 Note: this function is not thread-safe. Call get_xlate_handle_node
293 instead. */
294 static svn_error_t *
295 get_xlate_handle_node_internal(xlate_handle_node_t **ret,
296 const char *topage, const char *frompage,
297 const char *userdata_key, apr_pool_t *pool)
299 /* If we already have a handle, just return it. */
300 if (userdata_key && xlate_handle_hash)
302 xlate_handle_node_t *old_node = NULL;
304 /* 2nd level: hash lookup */
305 xlate_handle_node_t **old_node_p = svn_hash_gets(xlate_handle_hash,
306 userdata_key);
307 if (old_node_p)
308 old_node = *old_node_p;
309 if (old_node)
311 /* Ensure that the handle is still valid. */
312 if (old_node->valid)
314 /* Remove from the list. */
315 *old_node_p = old_node->next;
316 old_node->next = NULL;
317 *ret = old_node;
318 return SVN_NO_ERROR;
323 /* Note that we still have the mutex locked (if it is initialized), so we
324 can use the global pool for creating the new xlate handle. */
326 /* Use the correct pool for creating the handle. */
327 pool = apr_hash_pool_get(xlate_handle_hash);
329 return xlate_alloc_handle(ret, topage, frompage, pool);
332 /* Set *RET to a handle node for converting from FROMPAGE to TOPAGE,
333 creating the handle node if it doesn't exist in USERDATA_KEY.
334 If a node is not cached and apr_xlate_open() returns APR_EINVAL or
335 APR_ENOTIMPL, set (*RET)->handle to NULL. If fail for any other
336 reason, return the error.
338 Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
339 hasn't been called or USERDATA_KEY is NULL. Else, allocate them
340 in the pool of xlate_handle_hash. */
341 static svn_error_t *
342 get_xlate_handle_node(xlate_handle_node_t **ret,
343 const char *topage, const char *frompage,
344 const char *userdata_key, apr_pool_t *pool)
346 xlate_handle_node_t *old_node = NULL;
348 /* If we already have a handle, just return it. */
349 if (userdata_key)
351 if (xlate_handle_hash)
353 /* 1st level: global, static items */
354 if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
355 old_node = atomic_swap(&xlat_ntou_static_handle, NULL);
356 else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
357 old_node = atomic_swap(&xlat_uton_static_handle, NULL);
359 if (old_node && old_node->valid)
361 *ret = old_node;
362 return SVN_NO_ERROR;
365 else
367 void *p;
368 /* We fall back on a per-pool cache instead. */
369 apr_pool_userdata_get(&p, userdata_key, pool);
370 old_node = p;
371 /* Ensure that the handle is still valid. */
372 if (old_node && old_node->valid)
374 *ret = old_node;
375 return SVN_NO_ERROR;
378 return xlate_alloc_handle(ret, topage, frompage, pool);
382 SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
383 get_xlate_handle_node_internal(ret,
384 topage,
385 frompage,
386 userdata_key,
387 pool));
389 return SVN_NO_ERROR;
392 /* Put back NODE into the xlate handle cache for use by other calls.
394 Note: this function is not thread-safe. Call put_xlate_handle_node
395 instead. */
396 static svn_error_t *
397 put_xlate_handle_node_internal(xlate_handle_node_t *node,
398 const char *userdata_key)
400 xlate_handle_node_t **node_p = svn_hash_gets(xlate_handle_hash, userdata_key);
401 if (node_p == NULL)
403 userdata_key = apr_pstrdup(apr_hash_pool_get(xlate_handle_hash),
404 userdata_key);
405 node_p = apr_palloc(apr_hash_pool_get(xlate_handle_hash),
406 sizeof(*node_p));
407 *node_p = NULL;
408 svn_hash_sets(xlate_handle_hash, userdata_key, node_p);
410 node->next = *node_p;
411 *node_p = node;
413 return SVN_NO_ERROR;
416 /* Put back NODE into the xlate handle cache for use by other calls.
417 If there is no global cache, store the handle in POOL.
418 Ignore errors related to locking/unlocking the mutex. */
419 static svn_error_t *
420 put_xlate_handle_node(xlate_handle_node_t *node,
421 const char *userdata_key,
422 apr_pool_t *pool)
424 assert(node->next == NULL);
425 if (!userdata_key)
426 return SVN_NO_ERROR;
428 /* push previous global node to the hash */
429 if (xlate_handle_hash)
431 /* 1st level: global, static items */
432 if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
433 node = atomic_swap(&xlat_ntou_static_handle, node);
434 else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
435 node = atomic_swap(&xlat_uton_static_handle, node);
436 if (node == NULL)
437 return SVN_NO_ERROR;
439 SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
440 put_xlate_handle_node_internal(node,
441 userdata_key));
443 else
445 /* Store it in the per-pool cache. */
446 apr_pool_userdata_set(node, userdata_key, apr_pool_cleanup_null, pool);
449 return SVN_NO_ERROR;
452 /* Return the apr_xlate handle for converting native characters to UTF-8. */
453 static svn_error_t *
454 get_ntou_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
456 return get_xlate_handle_node(ret, SVN_APR_UTF8_CHARSET,
457 assume_native_charset_is_utf8
458 ? SVN_APR_UTF8_CHARSET
459 : SVN_APR_LOCALE_CHARSET,
460 SVN_UTF_NTOU_XLATE_HANDLE, pool);
464 /* Return the apr_xlate handle for converting UTF-8 to native characters.
465 Create one if it doesn't exist. If unable to find a handle, or
466 unable to create one because apr_xlate_open returned APR_EINVAL, then
467 set *RET to null and return SVN_NO_ERROR; if fail for some other
468 reason, return error. */
469 static svn_error_t *
470 get_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
472 return get_xlate_handle_node(ret,
473 assume_native_charset_is_utf8
474 ? SVN_APR_UTF8_CHARSET
475 : SVN_APR_LOCALE_CHARSET,
476 SVN_APR_UTF8_CHARSET,
477 SVN_UTF_UTON_XLATE_HANDLE, pool);
481 /* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result
482 in *DEST, which is allocated in POOL. */
483 static svn_error_t *
484 convert_to_stringbuf(xlate_handle_node_t *node,
485 const char *src_data,
486 apr_size_t src_length,
487 svn_stringbuf_t **dest,
488 apr_pool_t *pool)
490 #ifdef WIN32
491 apr_status_t apr_err;
493 apr_err = svn_subr__win32_xlate_to_stringbuf(node->handle, src_data,
494 src_length, dest, pool);
495 #else
496 apr_size_t buflen = src_length * 2;
497 apr_status_t apr_err;
498 apr_size_t srclen = src_length;
499 apr_size_t destlen = buflen;
501 /* Initialize *DEST to an empty stringbuf.
502 A 1:2 ratio of input bytes to output bytes (as assigned above)
503 should be enough for most translations, and if it turns out not
504 to be enough, we'll grow the buffer again, sizing it based on a
505 1:3 ratio of the remainder of the string. */
506 *dest = svn_stringbuf_create_ensure(buflen + 1, pool);
508 /* Not only does it not make sense to convert an empty string, but
509 apr-iconv is quite unreasonable about not allowing that. */
510 if (src_length == 0)
511 return SVN_NO_ERROR;
515 /* Set up state variables for xlate. */
516 destlen = buflen - (*dest)->len;
518 /* Attempt the conversion. */
519 apr_err = apr_xlate_conv_buffer(node->handle,
520 src_data + (src_length - srclen),
521 &srclen,
522 (*dest)->data + (*dest)->len,
523 &destlen);
525 /* Now, update the *DEST->len to track the amount of output data
526 churned out so far from this loop. */
527 (*dest)->len += ((buflen - (*dest)->len) - destlen);
528 buflen += srclen * 3; /* 3 is middle ground, 2 wasn't enough
529 for all characters in the buffer, 4 is
530 maximum character size (currently) */
533 } while (apr_err == APR_SUCCESS && srclen != 0);
534 #endif
536 /* If we exited the loop with an error, return the error. */
537 if (apr_err)
539 const char *errstr;
540 svn_error_t *err;
542 /* Can't use svn_error_wrap_apr here because it calls functions in
543 this file, leading to infinite recursion. */
544 if (node->frompage == SVN_APR_LOCALE_CHARSET)
545 errstr = apr_psprintf
546 (pool, _("Can't convert string from native encoding to '%s':"),
547 node->topage);
548 else if (node->topage == SVN_APR_LOCALE_CHARSET)
549 errstr = apr_psprintf
550 (pool, _("Can't convert string from '%s' to native encoding:"),
551 node->frompage);
552 else
553 errstr = apr_psprintf
554 (pool, _("Can't convert string from '%s' to '%s':"),
555 node->frompage, node->topage);
557 err = svn_error_create(
558 apr_err, NULL, svn_utf__fuzzy_escape(src_data, src_length, pool));
559 return svn_error_create(apr_err, err, errstr);
561 /* Else, exited due to success. Trim the result buffer down to the
562 right length. */
563 (*dest)->data[(*dest)->len] = '\0';
565 return SVN_NO_ERROR;
569 /* Return APR_EINVAL if the first LEN bytes of DATA contain anything
570 other than seven-bit, non-control (except for whitespace) ASCII
571 characters, finding the error pool from POOL. Otherwise, return
572 SVN_NO_ERROR. */
573 static svn_error_t *
574 check_non_ascii(const char *data, apr_size_t len, apr_pool_t *pool)
576 const char *data_start = data;
578 for (; len > 0; --len, data++)
580 if ((! svn_ctype_isascii(*data))
581 || ((! svn_ctype_isspace(*data))
582 && svn_ctype_iscntrl(*data)))
584 /* Show the printable part of the data, followed by the
585 decimal code of the questionable character. Because if a
586 user ever gets this error, she's going to have to spend
587 time tracking down the non-ASCII data, so we want to help
588 as much as possible. And yes, we just call the unsafe
589 data "non-ASCII", even though the actual constraint is
590 somewhat more complex than that. */
592 if (data - data_start)
594 const char *error_data
595 = apr_pstrndup(pool, data_start, (data - data_start));
597 return svn_error_createf
598 (APR_EINVAL, NULL,
599 _("Safe data '%s' was followed by non-ASCII byte %d: "
600 "unable to convert to/from UTF-8"),
601 error_data, *((const unsigned char *) data));
603 else
605 return svn_error_createf
606 (APR_EINVAL, NULL,
607 _("Non-ASCII character (code %d) detected, "
608 "and unable to convert to/from UTF-8"),
609 *((const unsigned char *) data));
614 return SVN_NO_ERROR;
617 /* Construct an error with code APR_EINVAL and with a suitable message
618 * to describe the invalid UTF-8 sequence DATA of length LEN (which
619 * may have embedded NULLs). We can't simply print the data, almost
620 * by definition we don't really know how it is encoded.
622 static svn_error_t *
623 invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
625 const char *last = svn_utf__last_valid(data, len);
626 const char *valid_txt = "", *invalid_txt = "";
627 apr_size_t i;
628 size_t valid, invalid;
630 /* We will display at most 24 valid octets (this may split a leading
631 multi-byte character) as that should fit on one 80 character line. */
632 valid = last - data;
633 if (valid > 24)
634 valid = 24;
635 for (i = 0; i < valid; ++i)
636 valid_txt = apr_pstrcat(pool, valid_txt,
637 apr_psprintf(pool, " %02x",
638 (unsigned char)last[i-valid]),
639 SVN_VA_NULL);
641 /* 4 invalid octets will guarantee that the faulty octet is displayed */
642 invalid = data + len - last;
643 if (invalid > 4)
644 invalid = 4;
645 for (i = 0; i < invalid; ++i)
646 invalid_txt = apr_pstrcat(pool, invalid_txt,
647 apr_psprintf(pool, " %02x",
648 (unsigned char)last[i]),
649 SVN_VA_NULL);
651 return svn_error_createf(APR_EINVAL, NULL,
652 _("Valid UTF-8 data\n(hex:%s)\n"
653 "followed by invalid UTF-8 sequence\n(hex:%s)"),
654 valid_txt, invalid_txt);
657 /* Verify that the sequence DATA of length LEN is valid UTF-8.
658 If it is not, return an error with code APR_EINVAL. */
659 static svn_error_t *
660 check_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
662 if (! svn_utf__is_valid(data, len))
663 return invalid_utf8(data, len, pool);
664 return SVN_NO_ERROR;
667 /* Verify that the NULL terminated sequence DATA is valid UTF-8.
668 If it is not, return an error with code APR_EINVAL. */
669 static svn_error_t *
670 check_cstring_utf8(const char *data, apr_pool_t *pool)
673 if (! svn_utf__cstring_is_valid(data))
674 return invalid_utf8(data, strlen(data), pool);
675 return SVN_NO_ERROR;
679 svn_error_t *
680 svn_utf_stringbuf_to_utf8(svn_stringbuf_t **dest,
681 const svn_stringbuf_t *src,
682 apr_pool_t *pool)
684 xlate_handle_node_t *node;
685 svn_error_t *err;
687 SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
689 if (node->handle)
691 err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
692 if (! err)
693 err = check_utf8((*dest)->data, (*dest)->len, pool);
695 else
697 err = check_non_ascii(src->data, src->len, pool);
698 if (! err)
699 *dest = svn_stringbuf_dup(src, pool);
702 return svn_error_compose_create(err,
703 put_xlate_handle_node
704 (node,
705 SVN_UTF_NTOU_XLATE_HANDLE,
706 pool));
710 svn_error_t *
711 svn_utf_string_to_utf8(const svn_string_t **dest,
712 const svn_string_t *src,
713 apr_pool_t *pool)
715 svn_stringbuf_t *destbuf;
716 xlate_handle_node_t *node;
717 svn_error_t *err;
719 SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
721 if (node->handle)
723 err = convert_to_stringbuf(node, src->data, src->len, &destbuf, pool);
724 if (! err)
725 err = check_utf8(destbuf->data, destbuf->len, pool);
726 if (! err)
727 *dest = svn_stringbuf__morph_into_string(destbuf);
729 else
731 err = check_non_ascii(src->data, src->len, pool);
732 if (! err)
733 *dest = svn_string_dup(src, pool);
736 return svn_error_compose_create(err,
737 put_xlate_handle_node
738 (node,
739 SVN_UTF_NTOU_XLATE_HANDLE,
740 pool));
744 /* Common implementation for svn_utf_cstring_to_utf8,
745 svn_utf_cstring_to_utf8_ex, svn_utf_cstring_from_utf8 and
746 svn_utf_cstring_from_utf8_ex. Convert SRC to DEST using NODE->handle as
747 the translator and allocating from POOL. */
748 static svn_error_t *
749 convert_cstring(const char **dest,
750 const char *src,
751 xlate_handle_node_t *node,
752 apr_pool_t *pool)
754 if (node->handle)
756 svn_stringbuf_t *destbuf;
757 SVN_ERR(convert_to_stringbuf(node, src, strlen(src),
758 &destbuf, pool));
759 *dest = destbuf->data;
761 else
763 apr_size_t len = strlen(src);
764 SVN_ERR(check_non_ascii(src, len, pool));
765 *dest = apr_pstrmemdup(pool, src, len);
767 return SVN_NO_ERROR;
771 svn_error_t *
772 svn_utf_cstring_to_utf8(const char **dest,
773 const char *src,
774 apr_pool_t *pool)
776 xlate_handle_node_t *node;
777 svn_error_t *err;
779 SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
780 err = convert_cstring(dest, src, node, pool);
781 SVN_ERR(svn_error_compose_create(err,
782 put_xlate_handle_node
783 (node,
784 SVN_UTF_NTOU_XLATE_HANDLE,
785 pool)));
786 return check_cstring_utf8(*dest, pool);
790 svn_error_t *
791 svn_utf_cstring_to_utf8_ex2(const char **dest,
792 const char *src,
793 const char *frompage,
794 apr_pool_t *pool)
796 xlate_handle_node_t *node;
797 svn_error_t *err;
798 const char *convset_key = get_xlate_key(SVN_APR_UTF8_CHARSET, frompage,
799 pool);
801 SVN_ERR(get_xlate_handle_node(&node, SVN_APR_UTF8_CHARSET, frompage,
802 convset_key, pool));
803 err = convert_cstring(dest, src, node, pool);
804 SVN_ERR(svn_error_compose_create(err,
805 put_xlate_handle_node
806 (node,
807 SVN_UTF_NTOU_XLATE_HANDLE,
808 pool)));
810 return check_cstring_utf8(*dest, pool);
814 svn_error_t *
815 svn_utf_cstring_to_utf8_ex(const char **dest,
816 const char *src,
817 const char *frompage,
818 const char *convset_key,
819 apr_pool_t *pool)
821 return svn_utf_cstring_to_utf8_ex2(dest, src, frompage, pool);
825 svn_error_t *
826 svn_utf_stringbuf_from_utf8(svn_stringbuf_t **dest,
827 const svn_stringbuf_t *src,
828 apr_pool_t *pool)
830 xlate_handle_node_t *node;
831 svn_error_t *err;
833 SVN_ERR(get_uton_xlate_handle_node(&node, pool));
835 if (node->handle)
837 err = check_utf8(src->data, src->len, pool);
838 if (! err)
839 err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
841 else
843 err = check_non_ascii(src->data, src->len, pool);
844 if (! err)
845 *dest = svn_stringbuf_dup(src, pool);
848 err = svn_error_compose_create(
849 err,
850 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
852 return err;
856 svn_error_t *
857 svn_utf_string_from_utf8(const svn_string_t **dest,
858 const svn_string_t *src,
859 apr_pool_t *pool)
861 xlate_handle_node_t *node;
862 svn_error_t *err;
864 SVN_ERR(get_uton_xlate_handle_node(&node, pool));
866 if (node->handle)
868 err = check_utf8(src->data, src->len, pool);
869 if (! err)
871 svn_stringbuf_t *dbuf;
873 err = convert_to_stringbuf(node, src->data, src->len,
874 &dbuf, pool);
876 if (! err)
877 *dest = svn_stringbuf__morph_into_string(dbuf);
880 else
882 err = check_non_ascii(src->data, src->len, pool);
883 if (! err)
884 *dest = svn_string_dup(src, pool);
887 err = svn_error_compose_create(
888 err,
889 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
891 return err;
895 svn_error_t *
896 svn_utf_cstring_from_utf8(const char **dest,
897 const char *src,
898 apr_pool_t *pool)
900 xlate_handle_node_t *node;
901 svn_error_t *err;
903 SVN_ERR(check_cstring_utf8(src, pool));
905 SVN_ERR(get_uton_xlate_handle_node(&node, pool));
906 err = convert_cstring(dest, src, node, pool);
907 err = svn_error_compose_create(
908 err,
909 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
911 return err;
915 svn_error_t *
916 svn_utf_cstring_from_utf8_ex2(const char **dest,
917 const char *src,
918 const char *topage,
919 apr_pool_t *pool)
921 xlate_handle_node_t *node;
922 svn_error_t *err;
923 const char *convset_key = get_xlate_key(topage, SVN_APR_UTF8_CHARSET,
924 pool);
926 SVN_ERR(check_cstring_utf8(src, pool));
928 SVN_ERR(get_xlate_handle_node(&node, topage, SVN_APR_UTF8_CHARSET,
929 convset_key, pool));
930 err = convert_cstring(dest, src, node, pool);
931 err = svn_error_compose_create(
932 err,
933 put_xlate_handle_node(node, convset_key, pool));
935 return err;
938 const char *
939 svn_utf__cstring_from_utf8_fuzzy(const char *src,
940 apr_pool_t *pool,
941 svn_error_t *(*convert_from_utf8)
942 (const char **, const char *, apr_pool_t *))
944 const char *escaped, *converted;
945 svn_error_t *err;
947 escaped = svn_utf__fuzzy_escape(src, strlen(src), pool);
949 /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to
950 contain only 7-bit bytes :-). Recode to native... */
951 err = convert_from_utf8(((const char **) &converted), escaped, pool);
953 if (err)
955 svn_error_clear(err);
956 return escaped;
958 else
959 return converted;
961 /* ### Check the client locale, maybe we can avoid that second
962 * conversion! See Ulrich Drepper's patch at
963 * https://issues.apache.org/jira/browse/SVN-807.
968 const char *
969 svn_utf_cstring_from_utf8_fuzzy(const char *src,
970 apr_pool_t *pool)
972 return svn_utf__cstring_from_utf8_fuzzy(src, pool,
973 svn_utf_cstring_from_utf8);
977 svn_error_t *
978 svn_utf_cstring_from_utf8_stringbuf(const char **dest,
979 const svn_stringbuf_t *src,
980 apr_pool_t *pool)
982 svn_stringbuf_t *destbuf;
984 SVN_ERR(svn_utf_stringbuf_from_utf8(&destbuf, src, pool));
985 *dest = destbuf->data;
987 return SVN_NO_ERROR;
991 svn_error_t *
992 svn_utf_cstring_from_utf8_string(const char **dest,
993 const svn_string_t *src,
994 apr_pool_t *pool)
996 xlate_handle_node_t *node;
997 svn_error_t *err;
999 SVN_ERR(get_uton_xlate_handle_node(&node, pool));
1001 if (node->handle)
1003 err = check_utf8(src->data, src->len, pool);
1004 if (! err)
1006 svn_stringbuf_t *dbuf;
1008 err = convert_to_stringbuf(node, src->data, src->len,
1009 &dbuf, pool);
1010 if (! err)
1011 *dest = dbuf->data;
1014 else
1016 err = check_non_ascii(src->data, src->len, pool);
1017 if (! err)
1018 *dest = apr_pstrmemdup(pool, src->data, src->len);
1021 err = svn_error_compose_create(
1022 err,
1023 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
1025 return err;
1029 /* Insert the given UCS-4 VALUE into BUF at the given OFFSET. */
1030 static void
1031 membuf_insert_ucs4(svn_membuf_t *buf, apr_size_t offset, apr_int32_t value)
1033 svn_membuf__resize(buf, (offset + 1) * sizeof(value));
1034 ((apr_int32_t*)buf->data)[offset] = value;
/* TODO: Use compiler intrinsics for byte swaps. */

/* Exchange the two low-order bytes of X. */
#define SWAP_SHORT(x)  ((((x) & 0xff) << 8) | (((x) >> 8) & 0xff))

/* Reverse the four low-order bytes of X.  The masks are unsigned long so
   that every shift is carried out on an unsigned value; shifting a signed
   32-bit value with bit 7 set left by 24 bits would overflow, which is
   undefined behaviour.  The result therefore has type unsigned long. */
#define SWAP_LONG(x)  ((((x) & 0xffUL) << 24) | (((x) & 0xff00UL) << 8)    \
                       | (((x) >> 8) & 0xff00UL) | (((x) >> 24) & 0xffUL))

/* UTF-16 surrogate ranges: lead (high) and trail (low) code units. */
#define IS_UTF16_LEAD_SURROGATE(c)   ((c) >= 0xd800 && (c) <= 0xdbff)
#define IS_UTF16_TRAIL_SURROGATE(c)  ((c) >= 0xdc00 && (c) <= 0xdfff)
1045 svn_error_t *
1046 svn_utf__utf16_to_utf8(const svn_string_t **result,
1047 const apr_uint16_t *utf16str,
1048 apr_size_t utf16len,
1049 svn_boolean_t big_endian,
1050 apr_pool_t *result_pool,
1051 apr_pool_t *scratch_pool)
1053 static const apr_uint16_t endiancheck = 0xa55a;
1054 const svn_boolean_t arch_big_endian =
1055 (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
1056 const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
1058 apr_uint16_t lead_surrogate;
1059 apr_size_t length;
1060 apr_size_t offset;
1061 svn_membuf_t ucs4buf;
1062 svn_membuf_t resultbuf;
1063 svn_string_t *res;
1065 if (utf16len == SVN_UTF__UNKNOWN_LENGTH)
1067 const apr_uint16_t *endp = utf16str;
1068 while (*endp++)
1070 utf16len = (endp - utf16str);
1073 svn_membuf__create(&ucs4buf, utf16len * sizeof(apr_int32_t), scratch_pool);
1075 for (lead_surrogate = 0, length = 0, offset = 0;
1076 offset < utf16len; ++offset)
1078 const apr_uint16_t code =
1079 (swap_order ? SWAP_SHORT(utf16str[offset]) : utf16str[offset]);
1081 if (lead_surrogate)
1083 if (IS_UTF16_TRAIL_SURROGATE(code))
1085 /* Combine the lead and trail currogates into a 32-bit code. */
1086 membuf_insert_ucs4(&ucs4buf, length++,
1087 (0x010000
1088 + (((lead_surrogate & 0x03ff) << 10)
1089 | (code & 0x03ff))));
1090 lead_surrogate = 0;
1091 continue;
1093 else
1095 /* If we didn't find a surrogate pair, just dump the
1096 lead surrogate into the stream. */
1097 membuf_insert_ucs4(&ucs4buf, length++, lead_surrogate);
1098 lead_surrogate = 0;
1102 if ((offset + 1) < utf16len && IS_UTF16_LEAD_SURROGATE(code))
1104 /* Store a lead surrogate that is followed by at least one
1105 code for the next iteration. */
1106 lead_surrogate = code;
1107 continue;
1109 else
1110 membuf_insert_ucs4(&ucs4buf, length++, code);
1113 /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
1114 per code point for encoding. The buffer will grow as
1115 necessary. */
1116 svn_membuf__create(&resultbuf, length * 2, result_pool);
1117 SVN_ERR(svn_utf__encode_ucs4_string(
1118 &resultbuf, ucs4buf.data, length, &length));
1120 res = apr_palloc(result_pool, sizeof(*res));
1121 res->data = resultbuf.data;
1122 res->len = length;
1123 *result = res;
1124 return SVN_NO_ERROR;
1128 svn_error_t *
1129 svn_utf__utf32_to_utf8(const svn_string_t **result,
1130 const apr_int32_t *utf32str,
1131 apr_size_t utf32len,
1132 svn_boolean_t big_endian,
1133 apr_pool_t *result_pool,
1134 apr_pool_t *scratch_pool)
1136 static const apr_int32_t endiancheck = 0xa5cbbc5a;
1137 const svn_boolean_t arch_big_endian =
1138 (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
1139 const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
1141 apr_size_t length;
1142 svn_membuf_t resultbuf;
1143 svn_string_t *res;
1145 if (utf32len == SVN_UTF__UNKNOWN_LENGTH)
1147 const apr_int32_t *endp = utf32str;
1148 while (*endp++)
1150 utf32len = (endp - utf32str);
1153 if (swap_order)
1155 apr_size_t offset;
1156 svn_membuf_t ucs4buf;
1158 svn_membuf__create(&ucs4buf, utf32len * sizeof(apr_int32_t),
1159 scratch_pool);
1161 for (offset = 0; offset < utf32len; ++offset)
1163 const apr_int32_t code = SWAP_LONG(utf32str[offset]);
1164 membuf_insert_ucs4(&ucs4buf, offset, code);
1166 utf32str = ucs4buf.data;
1169 /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
1170 per code point for encoding. The buffer will grow as
1171 necessary. */
1172 svn_membuf__create(&resultbuf, utf32len * 2, result_pool);
1173 SVN_ERR(svn_utf__encode_ucs4_string(
1174 &resultbuf, utf32str, utf32len, &length));
1176 res = apr_palloc(result_pool, sizeof(*res));
1177 res->data = resultbuf.data;
1178 res->len = length;
1179 *result = res;
1180 return SVN_NO_ERROR;
1184 #ifdef WIN32
1187 svn_error_t *
1188 svn_utf__win32_utf8_to_utf16(const WCHAR **result,
1189 const char *src,
1190 const WCHAR *prefix,
1191 apr_pool_t *result_pool)
1193 const int utf8_count = strlen(src);
1194 const int prefix_len = (prefix ? lstrlenW(prefix) : 0);
1195 WCHAR *wide_str;
1196 int wide_count;
1198 if (0 == prefix_len + utf8_count)
1200 *result = L"";
1201 return SVN_NO_ERROR;
1204 wide_count = MultiByteToWideChar(CP_UTF8, 0, src, utf8_count, NULL, 0);
1205 if (wide_count == 0)
1206 return svn_error_wrap_apr(apr_get_os_error(),
1207 _("Conversion to UTF-16 failed"));
1209 wide_str = apr_palloc(result_pool,
1210 (prefix_len + wide_count + 1) * sizeof(*wide_str));
1211 if (prefix_len)
1212 memcpy(wide_str, prefix, prefix_len * sizeof(*wide_str));
1213 if (0 == MultiByteToWideChar(CP_UTF8, 0, src, utf8_count,
1214 wide_str + prefix_len, wide_count))
1215 return svn_error_wrap_apr(apr_get_os_error(),
1216 _("Conversion to UTF-16 failed"));
1218 wide_str[prefix_len + wide_count] = 0;
1219 *result = wide_str;
1221 return SVN_NO_ERROR;
1224 svn_error_t *
1225 svn_utf__win32_utf16_to_utf8(const char **result,
1226 const WCHAR *src,
1227 const char *prefix,
1228 apr_pool_t *result_pool)
1230 const int wide_count = lstrlenW(src);
1231 const int prefix_len = (prefix ? strlen(prefix) : 0);
1232 char *utf8_str;
1233 int utf8_count;
1235 if (0 == prefix_len + wide_count)
1237 *result = "";
1238 return SVN_NO_ERROR;
1241 utf8_count = WideCharToMultiByte(CP_UTF8, 0, src, wide_count,
1242 NULL, 0, NULL, FALSE);
1243 if (utf8_count == 0)
1244 return svn_error_wrap_apr(apr_get_os_error(),
1245 _("Conversion from UTF-16 failed"));
1247 utf8_str = apr_palloc(result_pool,
1248 (prefix_len + utf8_count + 1) * sizeof(*utf8_str));
1249 if (prefix_len)
1250 memcpy(utf8_str, prefix, prefix_len * sizeof(*utf8_str));
1251 if (0 == WideCharToMultiByte(CP_UTF8, 0, src, wide_count,
1252 utf8_str + prefix_len, utf8_count,
1253 NULL, FALSE))
1254 return svn_error_wrap_apr(apr_get_os_error(),
1255 _("Conversion from UTF-16 failed"));
1257 utf8_str[prefix_len + utf8_count] = 0;
1258 *result = utf8_str;
1260 return SVN_NO_ERROR;
1263 #endif /* WIN32 */