2 Unix SMB/CIFS implementation.
3 Samba utility functions
4 Copyright (C) Andrew Tridgell 1992-2001
5 Copyright (C) Simo Sorce 2001
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #include "system/locale.h"
24 #include "lib/util/byteorder.h"
25 #include "lib/util/fault.h"
26 #include "lib/util/tsort.h"
30 NOTE: oldc and newc must be 7 bit characters
32 _PUBLIC_
void string_replace_m(char *s
, char oldc
, char newc
)
34 struct smb_iconv_handle
*ic
= get_iconv_handle();
37 codepoint_t c
= next_codepoint_handle(ic
, s
, &size
);
46 Convert a string to lower case, allocated with talloc
48 _PUBLIC_
char *strlower_talloc_handle(struct smb_iconv_handle
*iconv_handle
,
49 TALLOC_CTX
*ctx
, const char *src
)
58 /* this takes advantage of the fact that upper/lower can't
59 change the length of a character by more than 1 byte */
60 dest
= talloc_array(ctx
, char, 2*(strlen(src
))+1);
67 codepoint_t c
= next_codepoint_handle(iconv_handle
, src
, &c_size
);
72 c_size
= push_codepoint_handle(iconv_handle
, dest
+size
, c
);
82 /* trim it so talloc_append_string() works */
83 dest
= talloc_realloc(ctx
, dest
, char, size
+1);
85 talloc_set_name_const(dest
, dest
);
90 _PUBLIC_
char *strlower_talloc(TALLOC_CTX
*ctx
, const char *src
)
92 struct smb_iconv_handle
*iconv_handle
= get_iconv_handle();
93 return strlower_talloc_handle(iconv_handle
, ctx
, src
);
97 Convert a string to UPPER case, allocated with talloc
98 source length limited to n bytes, iconv handle supplied
100 _PUBLIC_
char *strupper_talloc_n_handle(struct smb_iconv_handle
*iconv_handle
,
101 TALLOC_CTX
*ctx
, const char *src
, size_t n
)
110 /* this takes advantage of the fact that upper/lower can't
111 change the length of a character by more than 1 byte */
112 dest
= talloc_array(ctx
, char, 2*(n
+1));
119 codepoint_t c
= next_codepoint_handle_ext(iconv_handle
, src
, n
,
126 c_size
= push_codepoint_handle(iconv_handle
, dest
+size
, c
);
136 /* trim it so talloc_append_string() works */
137 dest
= talloc_realloc(ctx
, dest
, char, size
+1);
139 talloc_set_name_const(dest
, dest
);
145 Convert a string to UPPER case, allocated with talloc
146 source length limited to n bytes
148 _PUBLIC_
char *strupper_talloc_n(TALLOC_CTX
*ctx
, const char *src
, size_t n
)
150 struct smb_iconv_handle
*iconv_handle
= get_iconv_handle();
151 return strupper_talloc_n_handle(iconv_handle
, ctx
, src
, n
);
154 Convert a string to UPPER case, allocated with talloc
156 _PUBLIC_
char *strupper_talloc(TALLOC_CTX
*ctx
, const char *src
)
158 return strupper_talloc_n(ctx
, src
, src
?strlen(src
):0);
162 talloc_strdup() a unix string to upper case.
164 _PUBLIC_
char *talloc_strdup_upper(TALLOC_CTX
*ctx
, const char *src
)
166 return strupper_talloc(ctx
, src
);
171 * strncasecmp_ldb() works a *bit* like strncasecmp, with various
172 * tricks to suit the way LDB compares strings. The differences are:
174 * 0. each string has its own length.
176 * 1. consecutive spaces are collapsed down to one space, so that
177 * "a  b" equals "a b". (this is why each string needs its own
178 * length). Leading and trailing spaces are removed altogether.
180 * 2. Comparisons are done in UPPER CASE, as Windows does, not in
181 * lowercase as POSIX would have it.
183 * 3. An invalid byte compares higher than any real character. For example,
184 * "hello\xc2\xff" would sort higher than "hello\xcd\xb6", because CD
185 * B6 is a valid sequence and C2 FF is not.
187 * 4. If two strings become invalid on the same character, the rest
188 * of the string is compared via ldb ASCII case fold rules.
190 * For example, "hellō\xC2\xFFworld" < " hElLŌ\xFE ", because the
191 * strings are equal up to 'ō' by utf-8 casefold, but the "\xc2\xff" and
192 * "\xfe" are invalid sequences. At that point, we skip to the byte-by-byte
193 * (but space-eating, casefolding) comparison, and 0xc2 < 0xfe.
196 #define EAT_SPACE(s, len, ends_in_space) \
205 ends_in_space = (len == 0 || *s == '\0'); \
209 _PUBLIC_
int strncasecmp_ldb(const char *s1
,
214 struct smb_iconv_handle
*iconv_handle
= get_iconv_handle();
217 bool ends_in_space1
, ends_in_space2
;
221 EAT_SPACE(s1
, len1
, ends_in_space1
);
222 EAT_SPACE(s2
, len2
, ends_in_space2
);
224 * if ends_in_space was set, the string was empty or only
225 * spaces (which we treat as equivalent).
227 if (ends_in_space1
&& ends_in_space2
) {
230 if (ends_in_space1
) {
233 if (ends_in_space2
) {
239 * If the next byte is a space, we eat all the spaces,
240 * and say we found a single codepoint. If the spaces
241 * were at the end of the string, the codepoint is 0,
242 * as if there were no spaces. Otherwise it is 0x20,
243 * as if there was one space.
245 * Setting the codepoint to 0 will break the loop, but
246 * only after codepoints have been found in both strings.
248 if (len1
== 0 || *s1
== 0) {
250 } else if (*s1
== ' ') {
251 EAT_SPACE(s1
, len1
, ends_in_space1
);
252 c1
= ends_in_space1
? 0 : ' ';
253 } else if ((*s1
& 0x80) == 0) {
258 c1
= next_codepoint_handle_ext(iconv_handle
, s1
, len1
,
260 if (c1
!= INVALID_CODEPOINT
) {
266 if (len2
== 0 || *s2
== 0) {
268 } else if (*s2
== ' ') {
269 EAT_SPACE(s2
, len2
, ends_in_space2
);
270 c2
= ends_in_space2
? 0 : ' ';
271 } else if ((*s2
& 0x80) == 0) {
276 c2
= next_codepoint_handle_ext(iconv_handle
, s2
, len2
,
278 if (c2
!= INVALID_CODEPOINT
) {
284 if (c1
== 0 || c2
== 0 ||
285 c1
== INVALID_CODEPOINT
|| c2
== INVALID_CODEPOINT
) {
300 * Either a difference has been found, or one or both strings have
301 * ended or hit invalid codepoints.
303 ret
= NUMERIC_CMP(c1
, c2
);
309 * the strings are equal up to here, but one might be longer.
311 end1
= len1
== 0 || *s1
== 0;
312 end2
= len2
== 0 || *s2
== 0;
325 * By elimination, if we got here, we have INVALID_CODEPOINT on both
328 * There is no perfect option, but what we choose to do is continue on
329 * with ascii case fold (as if calling ldb_comparison_fold_ascii()
330 * which is private to ldb, so we can't just defer to it).
333 if (len1
== 0 || *s1
== 0) {
335 } else if (*s1
== ' ') {
336 EAT_SPACE(s1
, len1
, ends_in_space1
);
337 c1
= ends_in_space1
? 0 : ' ';
342 c1
= ('a' <= c1
&& c1
<= 'z') ? c1
^ 0x20 : c1
;
345 if (len2
== 0 || *s2
== 0) {
347 } else if (*s2
== ' ') {
348 EAT_SPACE(s2
, len2
, ends_in_space2
);
349 c2
= ends_in_space2
? 0 : ' ';
354 c2
= ('a' <= c2
&& c2
<= 'z') ? c2
^ 0x20 : c2
;
357 if (c1
== 0 || c2
== 0 || c1
!= c2
) {
361 return NUMERIC_CMP(c1
, c2
);
368 Find the number of 'c' chars in a string
370 _PUBLIC_
size_t count_chars_m(const char *s
, char c
)
372 struct smb_iconv_handle
*ic
= get_iconv_handle();
377 codepoint_t c2
= next_codepoint_handle(ic
, s
, &size
);
378 if (c2
== c
) count
++;
385 size_t ucs2_align(const void *base_ptr
, const void *p
, int flags
)
387 if (flags
& (STR_NOALIGN
|STR_ASCII
)) {
390 return PTR_DIFF(p
, base_ptr
) & 1;
394 return the number of bytes occupied by a buffer in CH_UTF16 format
396 size_t utf16_len(const void *buf
)
400 for (len
= 0; PULL_LE_U16(buf
,len
); len
+= 2) ;
406 return the number of bytes occupied by a buffer in CH_UTF16 format
407 the result includes the null termination
409 size_t utf16_null_terminated_len(const void *buf
)
411 return utf16_len(buf
) + 2;
415 return the number of bytes occupied by a buffer in CH_UTF16 format
418 size_t utf16_len_n(const void *src
, size_t n
)
422 for (len
= 0; (len
+2 <= n
) && PULL_LE_U16(src
, len
); len
+= 2) ;
428 return the number of bytes occupied by a buffer in CH_UTF16 format
429 the result includes the null termination
432 size_t utf16_null_terminated_len_n(const void *src
, size_t n
)
436 len
= utf16_len_n(src
, n
);
445 unsigned char *talloc_utf16_strlendup(TALLOC_CTX
*mem_ctx
, const char *str
, size_t len
)
447 unsigned char *new_str
= NULL
;
449 /* Check for overflow. */
450 if (len
> SIZE_MAX
- 2) {
455 * Allocate the new string, including space for the
456 * UTF‐16 null terminator.
458 new_str
= talloc_size(mem_ctx
, len
+ 2);
459 if (new_str
== NULL
) {
463 memcpy(new_str
, str
, len
);
466 * Ensure that the UTF‐16 string is
470 new_str
[len
+ 1] = '\0';
475 unsigned char *talloc_utf16_strdup(TALLOC_CTX
*mem_ctx
, const char *str
)
480 return talloc_utf16_strlendup(mem_ctx
, str
, utf16_len(str
));
483 unsigned char *talloc_utf16_strndup(TALLOC_CTX
*mem_ctx
, const char *str
, size_t n
)
488 return talloc_utf16_strlendup(mem_ctx
, str
, utf16_len_n(str
, n
));
492 * Determine the length and validity of a utf-8 string.
494 * @param input the string pointer
495 * @param maxlen maximum size of the string
496 * @param byte_len receives the length of the valid section
497 * @param char_len receives the number of unicode characters in the valid section
498 * @param utf16_len receives the number of bytes the string would need in UTF16 encoding.
500 * @return true if the input is valid up to maxlen, or a '\0' byte, otherwise false.
502 bool utf8_check(const char *input
, size_t maxlen
,
507 const uint8_t *s
= (const uint8_t *)input
;
510 size_t long_chars
= 0;
513 for (i
= 0; i
< maxlen
; i
++, chars
++) {
520 if ((s
[i
] & 0xe0) == 0xc0) {
521 /* 110xxxxx 10xxxxxx */
523 if (maxlen
- i
< 2) {
527 if ((b
& 0xc0) != 0x80) {
530 codepoint
= (a
& 31) << 6 | (b
& 63);
531 if (codepoint
< 0x80) {
537 if ((s
[i
] & 0xf0) == 0xe0) {
538 /* 1110xxxx 10xxxxxx 10xxxxxx */
539 if (maxlen
- i
< 3) {
545 if ((b
& 0xc0) != 0x80 || (c
& 0xc0) != 0x80) {
548 codepoint
= (c
& 63) | (b
& 63) << 6 | (a
& 15) << 12;
550 if (codepoint
< 0x800) {
553 if (codepoint
>= 0xd800 && codepoint
<= 0xdfff) {
555 * This is an invalid codepoint, per
556 * RFC3629, as it encodes part of a
557 * UTF-16 surrogate pair for a
558 * character over U+10000, which ought
559 * to have been encoded as a four byte
568 if ((s
[i
] & 0xf8) == 0xf0) {
569 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
570 if (maxlen
- i
< 4) {
578 if ((b
& 0xc0) != 0x80 ||
579 (c
& 0xc0) != 0x80 ||
580 (d
& 0xc0) != 0x80) {
583 codepoint
= (d
& 63) | (c
& 63) << 6 | (b
& 63) << 12 | (a
& 7) << 18;
585 if (codepoint
< 0x10000 || codepoint
> 0x10ffff) {
588 /* this one will need two UTF16 characters */
594 * If it wasn't handled yet, it's wrong.
600 *utf16_len
= chars
+ long_chars
;
606 *utf16_len
= chars
+ long_chars
;
612 * Copy a string from a char* unix src to a dos codepage string destination.
614 * @param converted_size receives the number of bytes occupied by the string in the destination.
615 * @return bool true if success.
617 * @param flags can include
619 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
620 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
623 * @param dest_len the maximum length in bytes allowed in the
624 * destination. If @p dest_len is -1 then no maximum is used.
626 static bool push_ascii_string(void *dest
, const char *src
, size_t dest_len
, int flags
, size_t *converted_size
)
631 if (flags
& STR_UPPER
) {
632 char *tmpbuf
= strupper_talloc(NULL
, src
);
633 if (tmpbuf
== NULL
) {
636 ret
= push_ascii_string(dest
, tmpbuf
, dest_len
, flags
& ~STR_UPPER
, converted_size
);
641 src_len
= strlen(src
);
643 if (flags
& (STR_TERMINATE
| STR_TERMINATE_ASCII
))
646 return convert_string(CH_UNIX
, CH_DOS
, src
, src_len
, dest
, dest_len
, converted_size
);
650 * Copy a string from a dos codepage source to a unix char* destination.
652 * The resulting string in "dest" is always null terminated.
654 * @param flags can have:
656 * <dt>STR_TERMINATE</dt>
657 * <dd>STR_TERMINATE means the string in @p src
658 * is null terminated, and src_len is ignored.</dd>
661 * @param src_len is the length of the source area in bytes.
662 * @returns the number of bytes occupied by the string in @p src.
664 static ssize_t
pull_ascii_string(char *dest
, const void *src
, size_t dest_len
, size_t src_len
, int flags
)
668 if (flags
& (STR_TERMINATE
| STR_TERMINATE_ASCII
)) {
669 if (src_len
== (size_t)-1) {
670 src_len
= strlen((const char *)src
) + 1;
672 size_t len
= strnlen((const char *)src
, src_len
);
679 /* We're ignoring the return here.. */
680 (void)convert_string(CH_DOS
, CH_UNIX
, src
, src_len
, dest
, dest_len
, &size
);
683 dest
[MIN(size
, dest_len
-1)] = 0;
689 * Copy a string from a char* src to a unicode destination.
691 * @returns the number of bytes occupied by the string in the destination.
693 * @param flags can have:
696 * <dt>STR_TERMINATE <dd>means include the null termination.
697 * <dt>STR_UPPER <dd>means uppercase in the destination.
698 * <dt>STR_NOALIGN <dd>means don't do alignment.
701 * @param dest_len is the maximum length allowed in the
702 * destination. If dest_len is -1 then no maximum is used.
704 static ssize_t
push_ucs2(void *dest
, const char *src
, size_t dest_len
, int flags
)
707 size_t src_len
= strlen(src
);
711 if (flags
& STR_UPPER
) {
712 char *tmpbuf
= strupper_talloc(NULL
, src
);
714 if (tmpbuf
== NULL
) {
717 retval
= push_ucs2(dest
, tmpbuf
, dest_len
, flags
& ~STR_UPPER
);
722 if (flags
& STR_TERMINATE
)
725 if (ucs2_align(NULL
, dest
, flags
)) {
727 dest
= (void *)((char *)dest
+ 1);
728 if (dest_len
) dest_len
--;
732 /* ucs2 is always a multiple of 2 bytes */
735 ret
= convert_string(CH_UNIX
, CH_UTF16
, src
, src_len
, dest
, dest_len
, &size
);
747 Copy a string from a ucs2 source to a unix char* destination.
749 STR_TERMINATE means the string in src is null terminated.
750 STR_NOALIGN means don't try to align.
751 if STR_TERMINATE is set then src_len is ignored if it is -1.
752 src_len is the length of the source area in bytes
753 Return the number of bytes occupied by the string in src.
754 The resulting string in "dest" is always null terminated.
757 static size_t pull_ucs2(char *dest
, const void *src
, size_t dest_len
, size_t src_len
, int flags
)
761 if (ucs2_align(NULL
, src
, flags
)) {
762 src
= (const void *)((const char *)src
+ 1);
767 if (flags
& STR_TERMINATE
) {
768 if (src_len
== (size_t)-1) {
769 src_len
= utf16_null_terminated_len(src
);
771 src_len
= utf16_null_terminated_len_n(src
, src_len
);
775 /* ucs2 is always a multiple of 2 bytes */
776 if (src_len
!= (size_t)-1)
779 /* We're ignoring the return here.. */
780 (void)convert_string(CH_UTF16
, CH_UNIX
, src
, src_len
, dest
, dest_len
, &size
);
782 dest
[MIN(size
, dest_len
-1)] = 0;
788 Copy a string from a char* src to a unicode or ascii
789 dos codepage destination choosing unicode or ascii based on the
790 flags in the SMB buffer starting at base_ptr.
791 Return the number of bytes occupied by the string in the destination.
793 STR_TERMINATE means include the null termination.
794 STR_UPPER means uppercase in the destination.
795 STR_ASCII use ascii even with unicode packet.
796 STR_NOALIGN means don't do alignment.
797 dest_len is the maximum length allowed in the destination. If dest_len
798 is -1 then no maximum is used.
801 _PUBLIC_ ssize_t
push_string(void *dest
, const char *src
, size_t dest_len
, int flags
)
803 if (flags
& STR_ASCII
) {
805 if (push_ascii_string(dest
, src
, dest_len
, flags
, &size
)) {
806 return (ssize_t
)size
;
810 } else if (flags
& STR_UNICODE
) {
811 return push_ucs2(dest
, src
, dest_len
, flags
);
813 smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set");
820 Copy a string from a unicode or ascii source (depending on
821 the packet flags) to a char* destination.
823 STR_TERMINATE means the string in src is null terminated.
824 STR_UNICODE means to force as unicode.
825 STR_ASCII use ascii even with unicode packet.
826 STR_NOALIGN means don't do alignment.
827 if STR_TERMINATE is set then src_len is ignored if it is -1
828 src_len is the length of the source area in bytes.
829 Return the number of bytes occupied by the string in src.
830 The resulting string in "dest" is always null terminated.
833 _PUBLIC_ ssize_t
pull_string(char *dest
, const void *src
, size_t dest_len
, size_t src_len
, int flags
)
835 if (flags
& STR_ASCII
) {
836 return pull_ascii_string(dest
, src
, dest_len
, src_len
, flags
);
837 } else if (flags
& STR_UNICODE
) {
838 return pull_ucs2(dest
, src
, dest_len
, src_len
, flags
);
840 smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set");