smbd: Ignore twrp in chdir_below_conn()
[samba4-gss.git] / lib / util / charset / util_unistr.c
blob8812011050bd28d8cbdc5ea650477fae8ac0be93
1 /*
2 Unix SMB/CIFS implementation.
3 Samba utility functions
4 Copyright (C) Andrew Tridgell 1992-2001
5 Copyright (C) Simo Sorce 2001
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
21 #include "replace.h"
22 #include "system/locale.h"
23 #include "charset.h"
24 #include "lib/util/byteorder.h"
25 #include "lib/util/fault.h"
26 #include "lib/util/tsort.h"
28 /**
29 String replace.
30 NOTE: oldc and newc must be 7 bit characters
31 **/
32 _PUBLIC_ void string_replace_m(char *s, char oldc, char newc)
34 struct smb_iconv_handle *ic = get_iconv_handle();
35 while (s && *s) {
36 size_t size;
37 codepoint_t c = next_codepoint_handle(ic, s, &size);
38 if (c == oldc) {
39 *s = newc;
41 s += size;
45 /**
46 Convert a string to lower case, allocated with talloc
47 **/
48 _PUBLIC_ char *strlower_talloc_handle(struct smb_iconv_handle *iconv_handle,
49 TALLOC_CTX *ctx, const char *src)
51 size_t size=0;
52 char *dest;
54 if(src == NULL) {
55 return NULL;
58 /* this takes advantage of the fact that upper/lower can't
59 change the length of a character by more than 1 byte */
60 dest = talloc_array(ctx, char, 2*(strlen(src))+1);
61 if (dest == NULL) {
62 return NULL;
65 while (*src) {
66 size_t c_size;
67 codepoint_t c = next_codepoint_handle(iconv_handle, src, &c_size);
68 src += c_size;
70 c = tolower_m(c);
72 c_size = push_codepoint_handle(iconv_handle, dest+size, c);
73 if (c_size == -1) {
74 talloc_free(dest);
75 return NULL;
77 size += c_size;
80 dest[size] = 0;
82 /* trim it so talloc_append_string() works */
83 dest = talloc_realloc(ctx, dest, char, size+1);
85 talloc_set_name_const(dest, dest);
87 return dest;
90 _PUBLIC_ char *strlower_talloc(TALLOC_CTX *ctx, const char *src)
92 struct smb_iconv_handle *iconv_handle = get_iconv_handle();
93 return strlower_talloc_handle(iconv_handle, ctx, src);
96 /**
97 Convert a string to UPPER case, allocated with talloc
98 source length limited to n bytes, iconv handle supplied
99 **/
100 _PUBLIC_ char *strupper_talloc_n_handle(struct smb_iconv_handle *iconv_handle,
101 TALLOC_CTX *ctx, const char *src, size_t n)
103 size_t size=0;
104 char *dest;
106 if (!src) {
107 return NULL;
110 /* this takes advantage of the fact that upper/lower can't
111 change the length of a character by more than 1 byte */
112 dest = talloc_array(ctx, char, 2*(n+1));
113 if (dest == NULL) {
114 return NULL;
117 while (n && *src) {
118 size_t c_size;
119 codepoint_t c = next_codepoint_handle_ext(iconv_handle, src, n,
120 CH_UNIX, &c_size);
121 src += c_size;
122 n -= c_size;
124 c = toupper_m(c);
126 c_size = push_codepoint_handle(iconv_handle, dest+size, c);
127 if (c_size == -1) {
128 talloc_free(dest);
129 return NULL;
131 size += c_size;
134 dest[size] = 0;
136 /* trim it so talloc_append_string() works */
137 dest = talloc_realloc(ctx, dest, char, size+1);
139 talloc_set_name_const(dest, dest);
141 return dest;
145 Convert a string to UPPER case, allocated with talloc
146 source length limited to n bytes
148 _PUBLIC_ char *strupper_talloc_n(TALLOC_CTX *ctx, const char *src, size_t n)
150 struct smb_iconv_handle *iconv_handle = get_iconv_handle();
151 return strupper_talloc_n_handle(iconv_handle, ctx, src, n);
154 Convert a string to UPPER case, allocated with talloc
156 _PUBLIC_ char *strupper_talloc(TALLOC_CTX *ctx, const char *src)
158 return strupper_talloc_n(ctx, src, src?strlen(src):0);
162 talloc_strdup() a unix string to upper case.
164 _PUBLIC_ char *talloc_strdup_upper(TALLOC_CTX *ctx, const char *src)
166 return strupper_talloc(ctx, src);
171 * strncasecmp_ldb() works like a *bit* like strncasecmp, with various
172 * tricks to suit the way LDB compares strings. The differences are:
174 * 0. each string has it's own length.
176 * 1. consecutive spaces are collapsed down to one space, so that
177 * "a b" equals "a b". (this is why each string needs its own
178 * length). Leading and trailing spaces are removed altogether.
180 * 2. Comparisons are done in UPPER CASE, as Windows does, not in
181 * lowercase as POSIX would have it.
183 * 3. An invalid byte compares higher than any real character. For example,
184 * "hello\xc2\xff" would sort higher than "hello\xcd\xb6", because CD
185 * B6 is a valid sequence and C2 FF is not.
187 * 4. If two strings become invalid on the same character, the rest
188 * of the string is compared via ldb ASCII case fold rules.
190 * For example, "hellō\xC2\xFFworld" < " hElLŌ\xFE ", because the
191 * strings are equal up to 'ō' by utf-8 casefold, but the "\xc2\xff" and
192 * "\xfe" are invalid sequences. At that point, we skip to the byte-by-byte
193 * (but space-eating, casefolding) comparison, and 0xc2 < 0xff.
196 #define EAT_SPACE(s, len, ends_in_space) \
197 do { \
198 while (len) { \
199 if (*s != ' ') { \
200 break; \
202 s++; \
203 len--; \
205 ends_in_space = (len == 0 || *s == '\0'); \
206 } while(0)
209 _PUBLIC_ int strncasecmp_ldb(const char *s1,
210 size_t len1,
211 const char *s2,
212 size_t len2)
214 struct smb_iconv_handle *iconv_handle = get_iconv_handle();
215 codepoint_t c1, c2;
216 size_t cs1, cs2;
217 bool ends_in_space1, ends_in_space2;
218 int ret;
219 bool end1, end2;
221 EAT_SPACE(s1, len1, ends_in_space1);
222 EAT_SPACE(s2, len2, ends_in_space2);
224 * if ends_in_space was set, the string was empty or only
225 * spaces (which we treat as equivalent).
227 if (ends_in_space1 && ends_in_space2) {
228 return 0;
230 if (ends_in_space1) {
231 return -1;
233 if (ends_in_space2) {
234 return 1;
237 while (true) {
239 * If the next byte is a space, we eat all the spaces,
240 * and say we found a single codepoint. If the spaces
241 * were at the end of the string, the codepoint is 0,
242 * as if there were no spaces. Otherwise it is 0x20,
243 * as if there was one space.
245 * Setting the codepoint to 0 will break the loop, but
246 * only after codepoints have been found in both strings.
248 if (len1 == 0 || *s1 == 0) {
249 c1 = 0;
250 } else if (*s1 == ' ') {
251 EAT_SPACE(s1, len1, ends_in_space1);
252 c1 = ends_in_space1 ? 0 : ' ';
253 } else if ((*s1 & 0x80) == 0) {
254 c1 = *s1;
255 s1++;
256 len1--;
257 } else {
258 c1 = next_codepoint_handle_ext(iconv_handle, s1, len1,
259 CH_UNIX, &cs1);
260 if (c1 != INVALID_CODEPOINT) {
261 s1 += cs1;
262 len1 -= cs1;
266 if (len2 == 0 || *s2 == 0) {
267 c2 = 0;
268 } else if (*s2 == ' ') {
269 EAT_SPACE(s2, len2, ends_in_space2);
270 c2 = ends_in_space2 ? 0 : ' ';
271 } else if ((*s2 & 0x80) == 0) {
272 c2 = *s2;
273 s2++;
274 len2--;
275 } else {
276 c2 = next_codepoint_handle_ext(iconv_handle, s2, len2,
277 CH_UNIX, &cs2);
278 if (c2 != INVALID_CODEPOINT) {
279 s2 += cs2;
280 len2 -= cs2;
284 if (c1 == 0 || c2 == 0 ||
285 c1 == INVALID_CODEPOINT || c2 == INVALID_CODEPOINT) {
286 break;
289 if (c1 == c2) {
290 continue;
292 c1 = toupper_m(c1);
293 c2 = toupper_m(c2);
294 if (c1 != c2) {
295 break;
300 * Either a difference has been found, or one or both strings have
301 * ended or hit invalid codepoints.
303 ret = NUMERIC_CMP(c1, c2);
305 if (ret != 0) {
306 return ret;
309 * the strings are equal up to here, but one might be longer.
311 end1 = len1 == 0 || *s1 == 0;
312 end2 = len2 == 0 || *s2 == 0;
314 if (end1 && end2) {
315 return 0;
317 if (end1) {
318 return -1;
320 if (end2) {
321 return -1;
325 * By elimination, if we got here, we have INVALID_CODEPOINT on both
326 * sides.
328 * THere is no perfect option, but what we choose to do is continue on
329 * with ascii case fold (as if calling ldb_comparison_fold_ascii()
330 * which is private to ldb, so we can't just defer to it).
332 while (true) {
333 if (len1 == 0 || *s1 == 0) {
334 c1 = 0;
335 } else if (*s1 == ' ') {
336 EAT_SPACE(s1, len1, ends_in_space1);
337 c1 = ends_in_space1 ? 0 : ' ';
338 } else {
339 c1 = *s1;
340 s1++;
341 len1--;
342 c1 = ('a' <= c1 && c1 <= 'z') ? c1 ^ 0x20 : c1;
345 if (len2 == 0 || *s2 == 0) {
346 c2 = 0;
347 } else if (*s2 == ' ') {
348 EAT_SPACE(s2, len2, ends_in_space2);
349 c2 = ends_in_space2 ? 0 : ' ';
350 } else {
351 c2 = *s2;
352 s2++;
353 len2--;
354 c2 = ('a' <= c2 && c2 <= 'z') ? c2 ^ 0x20 : c2;
357 if (c1 == 0 || c2 == 0 || c1 != c2) {
358 break;
361 return NUMERIC_CMP(c1, c2);
364 #undef EAT_SPACE
368 Find the number of 'c' chars in a string
370 _PUBLIC_ size_t count_chars_m(const char *s, char c)
372 struct smb_iconv_handle *ic = get_iconv_handle();
373 size_t count = 0;
375 while (*s) {
376 size_t size;
377 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
378 if (c2 == c) count++;
379 s += size;
382 return count;
385 size_t ucs2_align(const void *base_ptr, const void *p, int flags)
387 if (flags & (STR_NOALIGN|STR_ASCII)) {
388 return 0;
390 return PTR_DIFF(p, base_ptr) & 1;
/**
 return the number of bytes occupied by a buffer in CH_UTF16 format
 the terminating zero 16-bit unit is not counted
**/
size_t utf16_len(const void *buf)
{
	size_t offset = 0;

	while (PULL_LE_U16(buf, offset) != 0) {
		offset += 2;
	}

	return offset;
}
/**
 return the number of bytes occupied by a buffer in CH_UTF16 format
 the result includes the null termination
**/
size_t utf16_null_terminated_len(const void *buf)
{
	const size_t terminator_bytes = 2;

	return utf16_len(buf) + terminator_bytes;
}
/**
 return the number of bytes occupied by a buffer in CH_UTF16 format
 limited by 'n' bytes; a 16-bit unit is only read when it lies
 entirely within the first n bytes
**/
size_t utf16_len_n(const void *src, size_t n)
{
	size_t offset = 0;

	while (offset + 2 <= n) {
		if (PULL_LE_U16(src, offset) == 0) {
			break;
		}
		offset += 2;
	}

	return offset;
}
/**
 return the number of bytes occupied by a buffer in CH_UTF16 format
 the result includes the null termination when it fits
 limited by 'n' bytes
**/
size_t utf16_null_terminated_len_n(const void *src, size_t n)
{
	const size_t body = utf16_len_n(src, n);

	/* count the terminator only if it lies within the n bytes */
	return (body + 2 <= n) ? body + 2 : body;
}
445 unsigned char *talloc_utf16_strlendup(TALLOC_CTX *mem_ctx, const char *str, size_t len)
447 unsigned char *new_str = NULL;
449 /* Check for overflow. */
450 if (len > SIZE_MAX - 2) {
451 return NULL;
455 * Allocate the new string, including space for the
456 * UTF‐16 null terminator.
458 new_str = talloc_size(mem_ctx, len + 2);
459 if (new_str == NULL) {
460 return NULL;
463 memcpy(new_str, str, len);
466 * Ensure that the UTF‐16 string is
467 * null‐terminated.
469 new_str[len] = '\0';
470 new_str[len + 1] = '\0';
472 return new_str;
475 unsigned char *talloc_utf16_strdup(TALLOC_CTX *mem_ctx, const char *str)
477 if (str == NULL) {
478 return NULL;
480 return talloc_utf16_strlendup(mem_ctx, str, utf16_len(str));
483 unsigned char *talloc_utf16_strndup(TALLOC_CTX *mem_ctx, const char *str, size_t n)
485 if (str == NULL) {
486 return NULL;
488 return talloc_utf16_strlendup(mem_ctx, str, utf16_len_n(str, n));
/**
 * Determine the length and validity of a utf-8 string.
 *
 * Scanning stops at maxlen, at a '\0' byte, or at the first invalid
 * sequence. The three output values always describe the valid
 * section seen before that point.
 *
 * @param input the string pointer
 * @param maxlen maximum size of the string
 * @param byte_len receives the length of the valid section
 * @param char_len receives the number of unicode characters in the valid section
 * @param utf16_len receives the number of 16-bit units the valid
 *        section would need in UTF16 encoding (a character above
 *        U+FFFF counts as two).
 *
 * @return true if the input is valid up to maxlen, or a '\0' byte, otherwise false.
 */
bool utf8_check(const char *input, size_t maxlen,
		size_t *byte_len,
		size_t *char_len,
		size_t *utf16_len)
{
	const uint8_t *bytes = (const uint8_t *)input;
	size_t pos = 0;
	size_t n_chars = 0;
	size_t n_pairs = 0;
	bool valid = true;

	while (pos < maxlen && bytes[pos] != 0) {
		const uint8_t lead = bytes[pos];
		const size_t remaining = maxlen - pos;
		uint32_t cp;
		size_t seq_len;

		if (lead < 0x80) {
			/* 0xxxxxxx */
			seq_len = 1;
		} else if ((lead & 0xe0) == 0xc0) {
			/* 110xxxxx 10xxxxxx */
			if (remaining < 2 ||
			    (bytes[pos + 1] & 0xc0) != 0x80) {
				valid = false;
				break;
			}
			cp = (lead & 31) << 6 | (bytes[pos + 1] & 63);
			if (cp < 0x80) {
				/* overlong encoding */
				valid = false;
				break;
			}
			seq_len = 2;
		} else if ((lead & 0xf0) == 0xe0) {
			/* 1110xxxx 10xxxxxx 10xxxxxx */
			if (remaining < 3 ||
			    (bytes[pos + 1] & 0xc0) != 0x80 ||
			    (bytes[pos + 2] & 0xc0) != 0x80) {
				valid = false;
				break;
			}
			cp = (bytes[pos + 2] & 63) |
			     (bytes[pos + 1] & 63) << 6 |
			     (lead & 15) << 12;
			if (cp < 0x800) {
				/* overlong encoding */
				valid = false;
				break;
			}
			if (cp >= 0xd800 && cp <= 0xdfff) {
				/*
				 * Half of a UTF-16 surrogate pair, which
				 * RFC3629 forbids in utf-8: a character
				 * over U+10000 ought to be a four byte
				 * sequence.
				 */
				valid = false;
				break;
			}
			seq_len = 3;
		} else if ((lead & 0xf8) == 0xf0) {
			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			if (remaining < 4 ||
			    (bytes[pos + 1] & 0xc0) != 0x80 ||
			    (bytes[pos + 2] & 0xc0) != 0x80 ||
			    (bytes[pos + 3] & 0xc0) != 0x80) {
				valid = false;
				break;
			}
			cp = (bytes[pos + 3] & 63) |
			     (bytes[pos + 2] & 63) << 6 |
			     (bytes[pos + 1] & 63) << 12 |
			     (lead & 7) << 18;
			if (cp < 0x10000 || cp > 0x10ffff) {
				valid = false;
				break;
			}
			/* this one will need two UTF16 characters */
			n_pairs++;
			seq_len = 4;
		} else {
			/* no other lead byte is acceptable */
			valid = false;
			break;
		}

		pos += seq_len;
		n_chars++;
	}

	*byte_len = pos;
	*char_len = n_chars;
	*utf16_len = n_chars + n_pairs;
	return valid;
}
612 * Copy a string from a char* unix src to a dos codepage string destination.
614 * @converted_size the number of bytes occupied by the string in the destination.
615 * @return bool true if success.
617 * @param flags can include
618 * <dl>
619 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
620 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
621 * </dl>
623 * @param dest_len the maximum length in bytes allowed in the
624 * destination. If @p dest_len is -1 then no maximum is used.
626 static bool push_ascii_string(void *dest, const char *src, size_t dest_len, int flags, size_t *converted_size)
628 size_t src_len;
629 bool ret;
631 if (flags & STR_UPPER) {
632 char *tmpbuf = strupper_talloc(NULL, src);
633 if (tmpbuf == NULL) {
634 return false;
636 ret = push_ascii_string(dest, tmpbuf, dest_len, flags & ~STR_UPPER, converted_size);
637 talloc_free(tmpbuf);
638 return ret;
641 src_len = strlen(src);
643 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
644 src_len++;
646 return convert_string(CH_UNIX, CH_DOS, src, src_len, dest, dest_len, converted_size);
650 * Copy a string from a dos codepage source to a unix char* destination.
652 * The resulting string in "dest" is always null terminated.
654 * @param flags can have:
655 * <dl>
656 * <dt>STR_TERMINATE</dt>
657 * <dd>STR_TERMINATE means the string in @p src
658 * is null terminated, and src_len is ignored.</dd>
659 * </dl>
661 * @param src_len is the length of the source area in bytes.
662 * @returns the number of bytes occupied by the string in @p src.
664 static ssize_t pull_ascii_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
666 size_t size = 0;
668 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) {
669 if (src_len == (size_t)-1) {
670 src_len = strlen((const char *)src) + 1;
671 } else {
672 size_t len = strnlen((const char *)src, src_len);
673 if (len < src_len)
674 len++;
675 src_len = len;
679 /* We're ignoring the return here.. */
680 (void)convert_string(CH_DOS, CH_UNIX, src, src_len, dest, dest_len, &size);
682 if (dest_len)
683 dest[MIN(size, dest_len-1)] = 0;
685 return src_len;
689 * Copy a string from a char* src to a unicode destination.
691 * @returns the number of bytes occupied by the string in the destination.
693 * @param flags can have:
695 * <dl>
696 * <dt>STR_TERMINATE <dd>means include the null termination.
697 * <dt>STR_UPPER <dd>means uppercase in the destination.
698 * <dt>STR_NOALIGN <dd>means don't do alignment.
699 * </dl>
701 * @param dest_len is the maximum length allowed in the
702 * destination. If dest_len is -1 then no maximum is used.
704 static ssize_t push_ucs2(void *dest, const char *src, size_t dest_len, int flags)
706 size_t len=0;
707 size_t src_len = strlen(src);
708 size_t size = 0;
709 bool ret;
711 if (flags & STR_UPPER) {
712 char *tmpbuf = strupper_talloc(NULL, src);
713 ssize_t retval;
714 if (tmpbuf == NULL) {
715 return -1;
717 retval = push_ucs2(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
718 talloc_free(tmpbuf);
719 return retval;
722 if (flags & STR_TERMINATE)
723 src_len++;
725 if (ucs2_align(NULL, dest, flags)) {
726 *(char *)dest = 0;
727 dest = (void *)((char *)dest + 1);
728 if (dest_len) dest_len--;
729 len++;
732 /* ucs2 is always a multiple of 2 bytes */
733 dest_len &= ~1;
735 ret = convert_string(CH_UNIX, CH_UTF16, src, src_len, dest, dest_len, &size);
736 if (ret == false) {
737 return 0;
740 len += size;
742 return (ssize_t)len;
747 Copy a string from a ucs2 source to a unix char* destination.
748 Flags can have:
749 STR_TERMINATE means the string in src is null terminated.
750 STR_NOALIGN means don't try to align.
751 if STR_TERMINATE is set then src_len is ignored if it is -1.
752 src_len is the length of the source area in bytes
753 Return the number of bytes occupied by the string in src.
754 The resulting string in "dest" is always null terminated.
757 static size_t pull_ucs2(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
759 size_t size = 0;
761 if (ucs2_align(NULL, src, flags)) {
762 src = (const void *)((const char *)src + 1);
763 if (src_len > 0)
764 src_len--;
767 if (flags & STR_TERMINATE) {
768 if (src_len == (size_t)-1) {
769 src_len = utf16_null_terminated_len(src);
770 } else {
771 src_len = utf16_null_terminated_len_n(src, src_len);
775 /* ucs2 is always a multiple of 2 bytes */
776 if (src_len != (size_t)-1)
777 src_len &= ~1;
779 /* We're ignoring the return here.. */
780 (void)convert_string(CH_UTF16, CH_UNIX, src, src_len, dest, dest_len, &size);
781 if (dest_len)
782 dest[MIN(size, dest_len-1)] = 0;
784 return src_len;
788 Copy a string from a char* src to a unicode or ascii
789 dos codepage destination choosing unicode or ascii based on the
790 flags in the SMB buffer starting at base_ptr.
791 Return the number of bytes occupied by the string in the destination.
792 flags can have:
793 STR_TERMINATE means include the null termination.
794 STR_UPPER means uppercase in the destination.
795 STR_ASCII use ascii even with unicode packet.
796 STR_NOALIGN means don't do alignment.
797 dest_len is the maximum length allowed in the destination. If dest_len
798 is -1 then no maximum is used.
801 _PUBLIC_ ssize_t push_string(void *dest, const char *src, size_t dest_len, int flags)
803 if (flags & STR_ASCII) {
804 size_t size = 0;
805 if (push_ascii_string(dest, src, dest_len, flags, &size)) {
806 return (ssize_t)size;
807 } else {
808 return (ssize_t)-1;
810 } else if (flags & STR_UNICODE) {
811 return push_ucs2(dest, src, dest_len, flags);
812 } else {
813 smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set");
814 return -1;
820 Copy a string from a unicode or ascii source (depending on
821 the packet flags) to a char* destination.
822 Flags can have:
823 STR_TERMINATE means the string in src is null terminated.
824 STR_UNICODE means to force as unicode.
825 STR_ASCII use ascii even with unicode packet.
826 STR_NOALIGN means don't do alignment.
827 if STR_TERMINATE is set then src_len is ignored is it is -1
828 src_len is the length of the source area in bytes.
829 Return the number of bytes occupied by the string in src.
830 The resulting string in "dest" is always null terminated.
833 _PUBLIC_ ssize_t pull_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
835 if (flags & STR_ASCII) {
836 return pull_ascii_string(dest, src, dest_len, src_len, flags);
837 } else if (flags & STR_UNICODE) {
838 return pull_ucs2(dest, src, dest_len, src_len, flags);
839 } else {
840 smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set");
841 return -1;