ctdb-tests: nfs_iterate_test() marks RPC service down
[samba4-gss.git] / lib / util / charset / convert_string.c
blob859b002ecbcf97ca4021331c3fbc24e88c504fd7
1 /*
2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001-2011
6 Copyright (C) Andrew Bartlett 2011
7 Copyright (C) Simo Sorce 2001
8 Copyright (C) Martin Pool 2003
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>.
24 #include "replace.h"
25 #include "system/iconv.h"
26 #include "charset.h"
27 #include "lib/util/debug.h"
28 #include "lib/util/fault.h"
/**
 * @file
 *
 * @brief Character-set conversion routines built on our iconv.
 *
 * @note Samba's internal character set (at least in the 3.0 series)
 * is always the same as the one for the Unix filesystem.  It is
 * <b>not</b> necessarily UTF-8 and may be different on machines that
 * need i18n filenames to be compatible with Unix software.  It does
 * have to be a superset of ASCII.  All multibyte sequences must start
 * with a byte with the high bit set.
 *
 * @sa lib/iconv.c
 */
46 /**
47 * Convert string from one encoding to another, making error checking etc
48 * Slow path version - uses (slow) iconv.
50 * @param src pointer to source string (multibyte or singlebyte)
51 * @param srclen length of the source string in bytes
52 * @param dest pointer to destination string (multibyte or singlebyte)
53 * @param destlen maximal length allowed for string
54 * @param converted size is the number of bytes occupied in the destination
56 * @returns false and sets errno on fail, true on success.
58 * Ensure the srclen contains the terminating zero.
60 **/
62 static bool convert_string_internal(struct smb_iconv_handle *ic,
63 charset_t from, charset_t to,
64 void const *src, size_t srclen,
65 void *dest, size_t destlen, size_t *converted_size)
67 size_t i_len, o_len;
68 size_t retval;
69 const char* inbuf = (const char*)src;
70 char* outbuf = (char*)dest;
71 smb_iconv_t descriptor;
73 descriptor = get_conv_handle(ic, from, to);
75 if (srclen == (size_t)-1) {
76 if (from == CH_UTF16LE || from == CH_UTF16BE) {
77 srclen = (strlen_w((const smb_ucs2_t *)src)+1) * 2;
78 } else {
79 srclen = strlen((const char *)src)+1;
84 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
85 errno = EINVAL;
86 return false;
89 i_len=srclen;
90 o_len=destlen;
92 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
93 *converted_size = destlen-o_len;
95 return (retval != (size_t)-1);
98 /**
99 * Convert string from one encoding to another, making error checking etc
100 * Fast path version - handles ASCII first.
102 * @param src pointer to source string (multibyte or singlebyte)
103 * @param srclen length of the source string in bytes, or -1 for nul terminated.
104 * @param dest pointer to destination string (multibyte or singlebyte)
105 * @param destlen maximal length allowed for string - *NEVER* -1.
106 * @param converted size is the number of bytes occupied in the destination
108 * @returns false and sets errno on fail, true on success.
110 * Ensure the srclen contains the terminating zero.
112 * This function has been hand-tuned to provide a fast path.
113 * Don't change unless you really know what you are doing. JRA.
116 bool convert_string_error_handle(struct smb_iconv_handle *ic,
117 charset_t from, charset_t to,
118 void const *src, size_t srclen,
119 void *dest, size_t destlen,
120 size_t *converted_size)
123 * NB. We deliberately don't do a strlen here if srclen == -1.
124 * This is very expensive over millions of calls and is taken
125 * care of in the slow path in convert_string_internal. JRA.
128 #ifdef DEVELOPER
129 SMB_ASSERT(destlen != (size_t)-1);
130 #endif
132 if (srclen == 0) {
133 *converted_size = 0;
134 return true;
137 if (from != CH_UTF16LE && from != CH_UTF16BE && to != CH_UTF16LE && to != CH_UTF16BE) {
138 const unsigned char *p = (const unsigned char *)src;
139 unsigned char *q = (unsigned char *)dest;
140 size_t slen = srclen;
141 size_t dlen = destlen;
142 unsigned char lastp = '\0';
143 size_t retval = 0;
145 /* If all characters are ascii, fast path here. */
146 while (slen && dlen) {
147 if ((lastp = *p) <= 0x7f) {
148 *q++ = *p++;
149 if (slen != (size_t)-1) {
150 slen--;
152 dlen--;
153 retval++;
154 if (!lastp)
155 break;
156 } else {
157 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
158 goto general_case;
159 #else
160 bool ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
161 *converted_size += retval;
162 return ret;
163 #endif
167 *converted_size = retval;
169 if (!dlen) {
170 /* Even if we fast path we should note if we ran out of room. */
171 if (((slen != (size_t)-1) && slen) ||
172 ((slen == (size_t)-1) && lastp)) {
173 errno = E2BIG;
174 return false;
177 return true;
178 } else if (from == CH_UTF16LE && to != CH_UTF16LE) {
179 const unsigned char *p = (const unsigned char *)src;
180 unsigned char *q = (unsigned char *)dest;
181 size_t retval = 0;
182 size_t slen = srclen;
183 size_t dlen = destlen;
184 unsigned char lastp = '\0';
185 #ifndef BROKEN_UNICODE_COMPOSE_CHARACTERS
186 bool ret;
187 #endif
189 if (slen == (size_t)-1) {
190 while (dlen &&
191 ((lastp = *p) <= 0x7f) && (p[1] == 0)) {
192 *q++ = *p;
193 p += 2;
194 dlen--;
195 retval++;
196 if (!lastp)
197 break;
199 if (lastp != 0) goto slow_path;
200 } else {
201 while (slen >= 2 && dlen &&
202 (*p <= 0x7f) && (p[1] == 0)) {
203 *q++ = *p;
204 slen -= 2;
205 p += 2;
206 dlen--;
207 retval++;
209 if (slen != 0) goto slow_path;
212 *converted_size = retval;
214 if (!dlen) {
215 /* Even if we fast path we should note if we ran out of room. */
216 if (((slen != (size_t)-1) && slen) ||
217 ((slen == (size_t)-1) && lastp)) {
218 errno = E2BIG;
219 return false;
222 return true;
224 slow_path:
225 /* come here when we hit a character we can't deal
226 * with in the fast path
228 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
229 goto general_case;
230 #else
231 ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
232 *converted_size += retval;
233 return ret;
234 #endif
236 } else if (from != CH_UTF16LE && from != CH_UTF16BE && to == CH_UTF16LE) {
237 const unsigned char *p = (const unsigned char *)src;
238 unsigned char *q = (unsigned char *)dest;
239 size_t retval = 0;
240 size_t slen = srclen;
241 size_t dlen = destlen;
242 unsigned char lastp = '\0';
244 /* If all characters are ascii, fast path here. */
245 while (slen && (dlen >= 1)) {
246 if (dlen >=2 && (lastp = *p) <= 0x7F) {
247 *q++ = *p++;
248 *q++ = '\0';
249 if (slen != (size_t)-1) {
250 slen--;
252 dlen -= 2;
253 retval += 2;
254 if (!lastp)
255 break;
256 } else {
257 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
258 goto general_case;
259 #else
260 bool ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
261 *converted_size += retval;
262 return ret;
263 #endif
267 *converted_size = retval;
269 if (!dlen) {
270 /* Even if we fast path we should note if we ran out of room. */
271 if (((slen != (size_t)-1) && slen) ||
272 ((slen == (size_t)-1) && lastp)) {
273 errno = E2BIG;
274 return false;
277 return true;
280 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
281 general_case:
282 #endif
283 return convert_string_internal(ic, from, to, src, srclen, dest, destlen, converted_size);
286 bool convert_string_handle(struct smb_iconv_handle *ic,
287 charset_t from, charset_t to,
288 void const *src, size_t srclen,
289 void *dest, size_t destlen,
290 size_t *converted_size)
292 bool ret = convert_string_error_handle(ic, from, to, src, srclen, dest, destlen, converted_size);
294 if(ret==false) {
295 const char *reason="unknown error";
296 switch(errno) {
297 case EINVAL:
298 reason="Incomplete multibyte sequence";
299 DBG_NOTICE("Conversion error: %s\n",
300 reason);
301 break;
302 case E2BIG:
304 reason="No more room";
305 if (from == CH_UNIX) {
306 DBG_NOTICE("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u error: %s\n",
307 charset_name(ic, from), charset_name(ic, to),
308 (unsigned int)srclen, (unsigned int)destlen, reason);
309 } else {
310 DBG_NOTICE("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u error: %s\n",
311 charset_name(ic, from), charset_name(ic, to),
312 (unsigned int)srclen, (unsigned int)destlen, reason);
314 break;
316 case EILSEQ:
317 reason="Illegal multibyte sequence";
318 DBG_NOTICE("convert_string_internal: Conversion error: %s\n",
319 reason);
320 break;
321 default:
322 DBG_ERR("convert_string_internal: Conversion error: %s\n",
323 reason);
324 break;
326 /* smb_panic(reason); */
328 return ret;
333 * Convert between character sets, allocating a new buffer using talloc for the result.
335 * @param srclen length of source buffer.
336 * @param dest always set at least to NULL
337 * @param converted_size set to the number of bytes occupied by the string in
338 * the destination on success.
339 * @note -1 is not accepted for srclen.
341 * @return true if new buffer was correctly allocated, and string was
342 * converted.
344 * Ensure the srclen contains the terminating zero.
346 bool convert_string_talloc_handle(TALLOC_CTX *ctx, struct smb_iconv_handle *ic,
347 charset_t from, charset_t to,
348 void const *src, size_t srclen, void *dst,
349 size_t *converted_size)
352 size_t i_len, o_len, destlen;
353 size_t retval;
354 const char *inbuf = NULL;
355 char *outbuf = NULL, *ob = NULL;
356 smb_iconv_t descriptor;
357 void **dest = dst;
359 *dest = NULL;
360 if (converted_size != NULL) {
361 *converted_size = 0;
364 if (src == NULL || srclen == (size_t)-1) {
365 errno = EINVAL;
366 return false;
369 if (srclen == 0) {
370 /* We really should treat this as an error, but
371 there are too many callers that need this to
372 return a NULL terminated string in the correct
373 character set. */
374 if (to == CH_UTF16LE|| to == CH_UTF16BE || to == CH_UTF16MUNGED) {
375 destlen = 2;
376 } else {
377 destlen = 1;
379 ob = talloc_zero_array(ctx, char, destlen);
380 if (ob == NULL) {
381 DBG_ERR("Could not talloc destination buffer.\n");
382 errno = ENOMEM;
383 return false;
385 if (converted_size != NULL) {
386 *converted_size = destlen;
388 *dest = ob;
389 return true;
392 descriptor = get_conv_handle(ic, from, to);
394 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
395 DEBUG(0,("convert_string_talloc: Conversion not supported.\n"));
396 errno = EOPNOTSUPP;
397 return false;
400 if (srclen >= (SIZE_MAX - 2) / 3) {
401 DBG_ERR("convert_string_talloc: "
402 "srclen is %zu, destlen would wrap!\n",
403 srclen);
404 errno = EOPNOTSUPP;
405 return false;
407 destlen = srclen * 3;
409 /* +2 is for ucs2 null termination. */
410 ob = talloc_realloc(ctx, ob, char, destlen + 2);
412 if (!ob) {
413 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
414 errno = ENOMEM;
415 return false;
417 outbuf = ob;
418 i_len = srclen;
419 o_len = destlen;
420 inbuf = (const char *)src;
422 retval = smb_iconv(descriptor,
423 &inbuf, &i_len,
424 &outbuf, &o_len);
425 if(retval == (size_t)-1) {
426 const char *reason="unknown error";
427 switch(errno) {
428 case EINVAL:
429 reason="Incomplete multibyte sequence";
430 DBG_NOTICE("Conversion error: %s\n",
431 reason);
432 break;
433 case E2BIG:
434 reason = "output buffer is too small";
435 DBG_ERR("Conversion error: %s\n",
436 reason);
437 break;
438 case EILSEQ:
439 reason="Illegal multibyte sequence";
440 DBG_NOTICE("Conversion error: %s\n",
441 reason);
442 break;
443 default:
444 DBG_ERR("Conversion error: %s\n",
445 reason);
446 break;
448 /* smb_panic(reason); */
449 TALLOC_FREE(ob);
450 return false;
453 destlen = destlen - o_len;
454 /* Don't shrink unless we're reclaiming a lot of
455 * space. This is in the hot codepath and these
456 * reallocs *cost*. JRA.
458 if (o_len > 1024) {
459 /* We're shrinking here so we know the +2 is safe from wrap. */
460 ob = talloc_realloc(ctx,ob, char, destlen + 2);
463 if (destlen && !ob) {
464 DEBUG(0, ("convert_string_talloc: out of memory!\n"));
465 errno = ENOMEM;
466 return false;
469 *dest = ob;
471 /* Must ucs2 null terminate in the extra space we allocated. */
472 ob[destlen] = '\0';
473 ob[destlen+1] = '\0';
475 /* Ensure we can never return a *converted_size of zero. */
476 if (destlen == 0) {
477 /* As we're now returning false on a bad smb_iconv call,
478 this should never happen. But be safe anyway. */
479 if (to == CH_UTF16LE|| to == CH_UTF16BE || to == CH_UTF16MUNGED) {
480 destlen = 2;
481 } else {
482 destlen = 1;
486 if (converted_size != NULL) {
487 *converted_size = destlen;
489 return true;
493 * Convert string from one encoding to another, with error checking.
494 * This version produces more logging information than
495 * convert_string_error(), but is otherwise functionally identical.
497 * @param src pointer to source string (multibyte or singlebyte)
498 * @param srclen length of the source string in bytes
499 * @param dest pointer to destination string (multibyte or singlebyte)
500 * @param destlen maximal length allowed for string
501 * @param converted_size the number of bytes occupied in the destination
503 * @returns true on success, false on fail.
505 _PUBLIC_ bool convert_string(charset_t from, charset_t to,
506 void const *src, size_t srclen,
507 void *dest, size_t destlen,
508 size_t *converted_size)
510 return convert_string_handle(get_iconv_handle(), from, to,
511 src, srclen,
512 dest, destlen, converted_size);
516 * Convert string from one encoding to another, with error checking.
517 * This version is less verbose than convert_string().
519 * @param src pointer to source string (multibyte or singlebyte)
520 * @param srclen length of the source string in bytes
521 * @param dest pointer to destination string (multibyte or singlebyte)
522 * @param destlen maximal length allowed for string
523 * @param converted_size the number of bytes occupied in the destination
525 * @returns true on success, false on fail.
527 _PUBLIC_ bool convert_string_error(charset_t from, charset_t to,
528 void const *src, size_t srclen,
529 void *dest, size_t destlen,
530 size_t *converted_size)
532 return convert_string_error_handle(get_iconv_handle(), from, to,
533 src, srclen,
534 dest, destlen, converted_size);
538 * Convert between character sets, allocating a new buffer using talloc for the result.
540 * @param srclen length of source buffer.
541 * @param dest always set at least to NULL
542 * @param converted_size Size in bytes of the converted string
543 * @note -1 is not accepted for srclen.
545 * @returns boolean indication whether the conversion succeeded
548 _PUBLIC_ bool convert_string_talloc(TALLOC_CTX *ctx,
549 charset_t from, charset_t to,
550 void const *src, size_t srclen,
551 void *dest, size_t *converted_size)
553 return convert_string_talloc_handle(ctx, get_iconv_handle(),
554 from, to, src, srclen, dest,
555 converted_size);