2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
4 * Copyright (c) 2000-2004 Anton Altaparmakov
5 * Copyright (c) 2002-2009 Szabolcs Szakacsits
6 * Copyright (c) 2008-2011 Jean-Pierre Andre
7 * Copyright (c) 2008 Bernhard Kaindl
9 * This program/include file is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as published
11 * by the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program/include file is distributed in the hope that it will be
15 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
16 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program (in the main directory of the NTFS-3G
21 * distribution in the file COPYING); if not, write to the Free Software
22 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
48 #if defined(__APPLE__) || defined(__DARWIN__)
50 #include <CoreFoundation/CoreFoundation.h>
51 #endif /* ENABLE_NFCONV */
52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
62 #define NOREVBOM 0 /* JPA rejecting U+FFFE and U+FFFF, open to debate */
68 * All these routines assume that the Unicode characters are in little endian
69 * encoding inside the strings!!!
72 static int use_utf8
= 1; /* use UTF-8 encoding for file names */
74 #if defined(__APPLE__) || defined(__DARWIN__)
77 * This variable controls whether or not automatic normalization form conversion
78 * should be performed when translating NTFS unicode file names to UTF-8.
79 * Defaults to on, but can be controlled from the outside using the function
80 * int ntfs_macosx_normalize_filenames(int normalize);
82 static int nfconvert_utf8
= 1;
83 #endif /* ENABLE_NFCONV */
84 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
87 * This is used by the name collation functions to quickly determine what
88 * characters are (in)valid.
91 static const u8 legal_ansi_char_array
[0x40] = {
92 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
93 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
95 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
96 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
98 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
99 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
101 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
102 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
107 * ntfs_names_are_equal - compare two Unicode names for equality
108 * @s1: name to compare to @s2
109 * @s1_len: length in Unicode characters of @s1
110 * @s2: name to compare to @s1
111 * @s2_len: length in Unicode characters of @s2
112 * @ic: ignore case bool
113 * @upcase: upcase table (only if @ic == IGNORE_CASE)
114 * @upcase_size: length in Unicode characters of @upcase (if present)
116 * Compare the names @s1 and @s2 and return TRUE (1) if the names are
117 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
118 * the @upcase table is used to perform a case insensitive comparison.
120 BOOL
ntfs_names_are_equal(const ntfschar
*s1
, size_t s1_len
,
121 const ntfschar
*s2
, size_t s2_len
,
122 const IGNORE_CASE_BOOL ic
,
123 const ntfschar
*upcase
, const u32 upcase_size
)
125 if (s1_len
!= s2_len
)
129 if (ic
== CASE_SENSITIVE
)
130 return ntfs_ucsncmp(s1
, s2
, s1_len
) ? FALSE
: TRUE
;
131 return ntfs_ucsncasecmp(s1
, s2
, s1_len
, upcase
, upcase_size
) ? FALSE
:
136 * ntfs_names_full_collate() fully collate two Unicode names
138 * @name1: first Unicode name to compare
139 * @name1_len: length of first Unicode name to compare
140 * @name2: second Unicode name to compare
141 * @name2_len: length of second Unicode name to compare
142 * @ic: either CASE_SENSITIVE or IGNORE_CASE
143 * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE)
144 * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE)
146 * -1 if the first name collates before the second one,
147 * 0 if the names match,
148 * 1 if the second name collates before the first one, or
151 int ntfs_names_full_collate(const ntfschar
*name1
, const u32 name1_len
,
152 const ntfschar
*name2
, const u32 name2_len
,
153 const IGNORE_CASE_BOOL ic
, const ntfschar
*upcase
,
154 const u32 upcase_len
)
161 if (!name1
|| !name2
|| (ic
&& (!upcase
|| !upcase_len
))) {
162 ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
166 cnt
= min(name1_len
, name2_len
);
168 if (ic
== CASE_SENSITIVE
) {
169 while (--cnt
&& (*name1
== *name2
)) {
173 u1
= c1
= le16_to_cpu(*name1
);
174 u2
= c2
= le16_to_cpu(*name2
);
176 u1
= le16_to_cpu(upcase
[u1
]);
178 u2
= le16_to_cpu(upcase
[u2
]);
179 if ((u1
== u2
) && cnt
)
182 u1
= le16_to_cpu(*name1
);
184 u2
= le16_to_cpu(*name2
);
186 u1
= le16_to_cpu(upcase
[u1
]);
188 u2
= le16_to_cpu(upcase
[u2
]);
189 } while ((u1
== u2
) && --cnt
);
194 if (name1_len
< name2_len
)
196 if (name1_len
> name2_len
)
204 u1
= c1
= le16_to_cpu(*name1
);
206 u2
= c2
= le16_to_cpu(*name2
);
209 u1
= le16_to_cpu(upcase
[u1
]);
211 u2
= le16_to_cpu(upcase
[u2
]);
212 } while ((u1
== u2
) && --cnt
);
217 if (name1_len
< name2_len
)
219 if (name1_len
> name2_len
)
223 if (name1_len
< name2_len
)
225 if (name1_len
> name2_len
)
232 * ntfs_ucsncmp - compare two little endian Unicode strings
235 * @n: maximum unicode characters to compare
237 * Compare the first @n characters of the Unicode strings @s1 and @s2.
238 * The strings are in little endian format and the appropriate le16_to_cpu()
239 * conversion is performed on non-little endian machines.
241 * The function returns an integer less than, equal to, or greater than zero
242 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
243 * to be less than, to match, or be greater than @s2.
245 int ntfs_ucsncmp(const ntfschar
*s1
, const ntfschar
*s2
, size_t n
)
252 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
256 for (i
= 0; i
< n
; ++i
) {
257 c1
= le16_to_cpu(s1
[i
]);
258 c2
= le16_to_cpu(s2
[i
]);
270 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
273 * @n: maximum unicode characters to compare
274 * @upcase: upcase table
275 * @upcase_size: upcase table size in Unicode characters
277 * Compare the first @n characters of the Unicode strings @s1 and @s2,
278 * ignoring case. The strings are in little endian format and the appropriate
279 * le16_to_cpu() conversion is performed on non-little endian machines.
281 * Each character is uppercased using the @upcase table before the comparison.
283 * The function returns an integer less than, equal to, or greater than zero
284 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
285 * to be less than, to match, or be greater than @s2.
287 int ntfs_ucsncasecmp(const ntfschar
*s1
, const ntfschar
*s2
, size_t n
,
288 const ntfschar
*upcase
, const u32 upcase_size
)
294 if (!s1
|| !s2
|| !upcase
) {
295 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
299 for (i
= 0; i
< n
; ++i
) {
300 if ((c1
= le16_to_cpu(s1
[i
])) < upcase_size
)
301 c1
= le16_to_cpu(upcase
[c1
]);
302 if ((c2
= le16_to_cpu(s2
[i
])) < upcase_size
)
303 c2
= le16_to_cpu(upcase
[c2
]);
315 * ntfs_ucsnlen - determine the length of a little endian Unicode string
316 * @s: pointer to Unicode string
317 * @maxlen: maximum length of string @s
319 * Return the number of Unicode characters in the little endian Unicode
320 * string @s up to a maximum of maxlen Unicode characters, not including
321 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
322 * and @s + @maxlen, @maxlen is returned.
324 * This function never looks beyond @s + @maxlen.
326 u32
ntfs_ucsnlen(const ntfschar
*s
, u32 maxlen
)
330 for (i
= 0; i
< maxlen
; i
++) {
331 if (!le16_to_cpu(s
[i
]))
338 * ntfs_ucsndup - duplicate little endian Unicode string
339 * @s: pointer to Unicode string
340 * @maxlen: maximum length of string @s
342 * Return a pointer to a new little endian Unicode string which is a duplicate
343 * of the string s. Memory for the new string is obtained with ntfs_malloc(3),
344 * and can be freed with free(3).
346 * A maximum of @maxlen Unicode characters are copied and a terminating
347 * (ntfschar)'\0' little endian Unicode character is added.
349 * This function never looks beyond @s + @maxlen.
351 * Return a pointer to the new little endian Unicode string on success and NULL
352 * on failure with errno set to the error code.
354 ntfschar
*ntfs_ucsndup(const ntfschar
*s
, u32 maxlen
)
359 len
= ntfs_ucsnlen(s
, maxlen
);
360 dst
= ntfs_malloc((len
+ 1) * sizeof(ntfschar
));
362 memcpy(dst
, s
, len
* sizeof(ntfschar
));
363 dst
[len
] = cpu_to_le16(L
'\0');
369 * ntfs_name_upcase - Map a Unicode name to its uppercase equivalent
379 void ntfs_name_upcase(ntfschar
*name
, u32 name_len
, const ntfschar
*upcase
,
380 const u32 upcase_len
)
385 for (i
= 0; i
< name_len
; i
++)
386 if ((u
= le16_to_cpu(name
[i
])) < upcase_len
)
391 * ntfs_name_locase - Map a Unicode name to its lowercase equivalent
393 void ntfs_name_locase(ntfschar
*name
, u32 name_len
, const ntfschar
*locase
,
394 const u32 locase_len
)
400 for (i
= 0; i
< name_len
; i
++)
401 if ((u
= le16_to_cpu(name
[i
])) < locase_len
)
406 * ntfs_file_value_upcase - Convert a filename to upper case
415 void ntfs_file_value_upcase(FILE_NAME_ATTR
*file_name_attr
,
416 const ntfschar
*upcase
, const u32 upcase_len
)
418 ntfs_name_upcase((ntfschar
*)&file_name_attr
->file_name
,
419 file_name_attr
->file_name_length
, upcase
, upcase_len
);
423 NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
424 for now]) for path names, but the Unicode code points need to be
425 converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
426 glibc does this even without a locale in a hard-coded fashion as that
427 appears to be easy because the low 7-bit ASCII range appears to be
428 available in all charsets but it does not convert anything if
429 there was some error with the locale setup or none set up like
430 when mount is called during early boot where we (by policy) do
431 not use locales (and they may not be available if /usr is not yet mounted),
432 so this patch fixes the resulting issues for systems which use
433 UTF-8 and for others, specifying the locale in fstab brings them
434 the encoding which they want.
436 If no locale is defined or there was a problem with setting one
437 up and whenever nl_langinfo(CODESET) returns a string starting with
438 "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
439 the bug where NTFS-3G does not show any path names which include
440 international characters!!! (and also fails on creating them) as result.
442 Author: Bernhard Kaindl <bk@suse.de>
443 Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
447 * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
448 * null) to store a given UTF-16LE string.
450 * Return -1 with errno set if string has invalid byte sequence or too long.
452 static int utf16_to_utf8_size(const ntfschar
*ins
, const int ins_len
, int outs_len
)
459 for (i
= 0; i
< ins_len
&& ins
[i
]; i
++) {
460 unsigned short c
= le16_to_cpu(ins
[i
]);
462 if ((c
>= 0xdc00) && (c
< 0xe000)) {
477 else if ((c
>= 0xe000) && (c
< 0xfffe))
479 else if (c
>= 0xe000)
484 if (count
> outs_len
) {
485 errno
= ENAMETOOLONG
;
501 * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to a UTF-8 string
502 * @ins: input utf16 string buffer
503 * @ins_len: length of input string in utf16 characters
504 * @outs: on return contains the (allocated) output multibyte string
505 * @outs_len: length of output buffer in bytes
507 * Return -1 with errno set if string has invalid byte sequence or too long.
509 static int ntfs_utf16_to_utf8(const ntfschar
*ins
, const int ins_len
,
510 char **outs
, int outs_len
)
512 #if defined(__APPLE__) || defined(__DARWIN__)
514 char *original_outs_value
= *outs
;
515 int original_outs_len
= outs_len
;
516 #endif /* ENABLE_NFCONV */
517 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
520 int i
, size
, ret
= -1;
527 size
= utf16_to_utf8_size(ins
, ins_len
, outs_len
);
534 *outs
= ntfs_malloc(outs_len
);
541 for (i
= 0; i
< ins_len
&& ins
[i
]; i
++) {
542 unsigned short c
= le16_to_cpu(ins
[i
]);
543 /* size not double-checked */
545 if ((c
>= 0xdc00) && (c
< 0xe000)) {
546 *t
++ = 0xf0 + (((halfpair
+ 64) >> 8) & 7);
547 *t
++ = 0x80 + (((halfpair
+ 64) >> 2) & 63);
548 *t
++ = 0x80 + ((c
>> 6) & 15) + ((halfpair
& 3) << 4);
549 *t
++ = 0x80 + (c
& 63);
553 } else if (c
< 0x80) {
557 *t
++ = (0xc0 | ((c
>> 6) & 0x3f));
558 *t
++ = 0x80 | (c
& 0x3f);
559 } else if (c
< 0xd800) {
560 *t
++ = 0xe0 | (c
>> 12);
561 *t
++ = 0x80 | ((c
>> 6) & 0x3f);
562 *t
++ = 0x80 | (c
& 0x3f);
563 } else if (c
< 0xdc00)
565 else if (c
>= 0xe000) {
566 *t
++ = 0xe0 | (c
>> 12);
567 *t
++ = 0x80 | ((c
>> 6) & 0x3f);
568 *t
++ = 0x80 | (c
& 0x3f);
575 #if defined(__APPLE__) || defined(__DARWIN__)
577 if(nfconvert_utf8
&& (t
- *outs
) > 0) {
578 char *new_outs
= NULL
;
579 int new_outs_len
= ntfs_macosx_normalize_utf8(*outs
, &new_outs
, 0); // Normalize to decomposed form
580 if(new_outs_len
>= 0 && new_outs
!= NULL
) {
581 if(original_outs_value
!= *outs
) {
582 // We have allocated outs ourselves.
585 t
= *outs
+ new_outs_len
;
588 // We need to copy new_outs into the fixed outs buffer.
589 memset(*outs
, 0, original_outs_len
);
590 strncpy(*outs
, new_outs
, original_outs_len
-1);
591 t
= *outs
+ original_outs_len
;
596 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs
);
597 ntfs_log_error(" new_outs=0x%p\n", new_outs
);
598 ntfs_log_error(" new_outs_len=%d\n", new_outs_len
);
601 #endif /* ENABLE_NFCONV */
602 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
613 * Return the amount of 16-bit elements in UTF-16LE needed
614 * (without the terminating null) to store given UTF-8 string.
616 * Return -1 with errno set if it's longer than PATH_MAX or string is invalid.
618 * Note: This does not check whether the input sequence is a valid utf8 string,
619 * and should be used only in context where such check is made!
621 static int utf8_to_utf16_size(const char *s
)
627 while ((byte
= *((const unsigned char *)s
++))) {
628 if (++count
>= PATH_MAX
)
647 if (++count
>= PATH_MAX
)
656 errno
= ENAMETOOLONG
;
660 * This converts one UTF-8 sequence to cpu-endian Unicode value
661 * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
663 * Return the number of used utf8 bytes or -1 with errno set
664 * if sequence is invalid.
666 static int utf8_to_unicode(u32
*wc
, const char *s
)
668 unsigned int byte
= *((const unsigned char *)s
);
674 } else if (byte
< 0x80) {
678 } else if (byte
< 0xc2) {
680 } else if (byte
< 0xE0) {
681 if ((s
[1] & 0xC0) == 0x80) {
682 *wc
= ((u32
)(byte
& 0x1F) << 6)
683 | ((u32
)(s
[1] & 0x3F));
688 } else if (byte
< 0xF0) {
689 if (((s
[1] & 0xC0) == 0x80) && ((s
[2] & 0xC0) == 0x80)) {
690 *wc
= ((u32
)(byte
& 0x0F) << 12)
691 | ((u32
)(s
[1] & 0x3F) << 6)
692 | ((u32
)(s
[2] & 0x3F));
693 /* Check valid ranges */
695 if (((*wc
>= 0x800) && (*wc
<= 0xD7FF))
696 || ((*wc
>= 0xe000) && (*wc
<= 0xFFFD)))
699 if (((*wc
>= 0x800) && (*wc
<= 0xD7FF))
700 || ((*wc
>= 0xe000) && (*wc
<= 0xFFFF)))
706 } else if (byte
< 0xF5) {
707 if (((s
[1] & 0xC0) == 0x80) && ((s
[2] & 0xC0) == 0x80)
708 && ((s
[3] & 0xC0) == 0x80)) {
709 *wc
= ((u32
)(byte
& 0x07) << 18)
710 | ((u32
)(s
[1] & 0x3F) << 12)
711 | ((u32
)(s
[2] & 0x3F) << 6)
712 | ((u32
)(s
[3] & 0x3F));
713 /* Check valid ranges */
714 if ((*wc
<= 0x10ffff) && (*wc
>= 0x10000))
725 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
726 * @ins: input multibyte string buffer
727 * @outs: on return contains the (allocated) output utf16 string
728 * @outs_len: length of output buffer in utf16 characters
730 * Return the length of the converted string in utf16 characters on success, or -1 with errno set on error.
732 static int ntfs_utf8_to_utf16(const char *ins
, ntfschar
**outs
)
734 #if defined(__APPLE__) || defined(__DARWIN__)
736 char *new_ins
= NULL
;
739 new_ins_len
= ntfs_macosx_normalize_utf8(ins
, &new_ins
, 1); // Normalize to composed form
743 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins
);
745 #endif /* ENABLE_NFCONV */
746 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
751 int shorts
, ret
= -1;
753 shorts
= utf8_to_utf16_size(ins
);
759 *outs
= ntfs_malloc((shorts
+ 1) * sizeof(ntfschar
));
768 int m
= utf8_to_unicode(&wc
, t
);
771 /* do not leave space allocated if failed */
774 *outs
= (ntfschar
*)NULL
;
778 *outpos
++ = const_cpu_to_le16(0);
782 *outpos
++ = cpu_to_le16(wc
);
785 *outpos
++ = cpu_to_le16((wc
>> 10) + 0xd800);
786 *outpos
++ = cpu_to_le16((wc
& 0x3ff) + 0xdc00);
791 ret
= --outpos
- *outs
;
793 #if defined(__APPLE__) || defined(__DARWIN__)
797 #endif /* ENABLE_NFCONV */
798 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
803 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
804 * @ins: input Unicode string buffer
805 * @ins_len: length of input string in Unicode characters
806 * @outs: on return contains the (allocated) output multibyte string
807 * @outs_len: length of output buffer in bytes
809 * Convert the input little endian, 2-byte Unicode string @ins, of length
810 * @ins_len into the multibyte string format dictated by the current locale.
812 * If *@outs is NULL, the function allocates the string and the caller is
813 * responsible for calling free(*@outs); when finished with it.
815 * On success the function returns the number of bytes written to the output
816 * string *@outs (>= 0), not counting the terminating NULL byte. If the output
817 * string buffer was allocated, *@outs is set to it.
819 * On error, -1 is returned, and errno is set to the error code. The following
820 * error codes can be expected:
821 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
822 * EILSEQ The input string cannot be represented as a multibyte
823 * sequence according to the current locale.
824 * ENAMETOOLONG Destination buffer is too small for input string.
825 * ENOMEM Not enough memory to allocate destination buffer.
827 int ntfs_ucstombs(const ntfschar
*ins
, const int ins_len
, char **outs
,
839 #endif /* MB_CUR_MAX */
847 if (mbs
&& !mbs_len
) {
848 errno
= ENAMETOOLONG
;
852 return ntfs_utf16_to_utf8(ins
, ins_len
, outs
, outs_len
);
855 mbs_len
= (ins_len
+ 1) * MB_CUR_MAX
;
856 mbs
= ntfs_malloc(mbs_len
);
861 memset(&mbstate
, 0, sizeof(mbstate
));
865 for (i
= o
= 0; i
< ins_len
; i
++) {
866 /* Reallocate memory if necessary or abort. */
867 if ((int)(o
+ MB_CUR_MAX
) > mbs_len
) {
870 errno
= ENAMETOOLONG
;
873 tc
= ntfs_malloc((mbs_len
+ 64) & ~63);
876 memcpy(tc
, mbs
, mbs_len
);
877 mbs_len
= (mbs_len
+ 64) & ~63;
881 /* Convert the LE Unicode character to a CPU wide character. */
882 wc
= (wchar_t)le16_to_cpu(ins
[i
]);
885 /* Convert the CPU endian wide character to multibyte. */
887 cnt
= wcrtomb(mbs
+ o
, wc
, &mbstate
);
889 cnt
= wctomb(mbs
+ o
, wc
);
894 ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt
);
901 /* Make sure we are back in the initial state. */
902 if (!mbsinit(&mbstate
)) {
903 ntfs_log_debug("Eeek. mbstate not in initial state!\n");
908 /* Now write the NULL character. */
919 #else /* MB_CUR_MAX */
921 #endif /* MB_CUR_MAX */
926 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
927 * @ins: input multibyte string buffer
928 * @outs: on return contains the (allocated) output Unicode string
930 * Convert the input multibyte string @ins, from the current locale into the
931 * corresponding little endian, 2-byte Unicode string.
933 * The function allocates the string and the caller is responsible for calling
934 * free(*@outs); when finished with it.
936 * On success the function returns the number of Unicode characters written to
937 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
940 * On error, -1 is returned, and errno is set to the error code. The following
941 * error codes can be expected:
942 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
943 * EILSEQ The input string cannot be represented as a Unicode
944 * string according to the current locale.
945 * ENAMETOOLONG Destination buffer is too small for input string.
946 * ENOMEM Not enough memory to allocate destination buffer.
948 int ntfs_mbstoucs(const char *ins
, ntfschar
**outs
)
954 int i
, o
, cnt
, ins_len
, ucs_len
, ins_size
;
958 #endif /* MB_CUR_MAX */
966 return ntfs_utf8_to_utf16(ins
, outs
);
969 /* Determine the size of the multi-byte string in bytes. */
970 ins_size
= strlen(ins
);
971 /* Determine the length of the multi-byte string. */
973 #if defined(HAVE_MBSINIT)
974 memset(&mbstate
, 0, sizeof(mbstate
));
975 ins_len
= mbsrtowcs(NULL
, (const char **)&s
, 0, &mbstate
);
977 if (!ins_len
&& *ins
) {
978 /* Older Cygwin had broken mbsrtowcs() implementation. */
979 ins_len
= strlen(ins
);
982 #elif !defined(DJGPP)
983 ins_len
= mbstowcs(NULL
, s
, 0);
985 /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
986 ins_len
= strlen(ins
);
991 if ((s
!= ins
) || !mbsinit(&mbstate
)) {
998 /* Add the NULL terminator. */
1001 ucs
= ntfs_malloc(ucs_len
* sizeof(ntfschar
));
1005 memset(&mbstate
, 0, sizeof(mbstate
));
1007 mbtowc(NULL
, NULL
, 0);
1009 for (i
= o
= cnt
= 0; i
< ins_size
; i
+= cnt
, o
++) {
1010 /* Reallocate memory if necessary. */
1013 ucs_len
= (ucs_len
* sizeof(ntfschar
) + 64) & ~63;
1014 tc
= realloc(ucs
, ucs_len
);
1018 ucs_len
/= sizeof(ntfschar
);
1020 /* Convert the multibyte character to a wide character. */
1022 cnt
= mbrtowc(&wc
, ins
+ i
, ins_size
- i
, &mbstate
);
1024 cnt
= mbtowc(&wc
, ins
+ i
, ins_size
- i
);
1031 ntfs_log_trace("Eeek. cnt = %i\n", cnt
);
1035 /* Make sure we are not overflowing the NTFS Unicode set. */
1036 if ((unsigned long)wc
>= (unsigned long)(1 <<
1037 (8 * sizeof(ntfschar
)))) {
1041 /* Convert the CPU wide character to a LE Unicode character. */
1042 ucs
[o
] = cpu_to_le16(wc
);
1045 /* Make sure we are back in the initial state. */
1046 if (!mbsinit(&mbstate
)) {
1047 ntfs_log_trace("Eeek. mbstate not in initial state!\n");
1052 /* Now write the NULL character. */
1053 ucs
[o
] = cpu_to_le16(L
'\0');
1058 #else /* MB_CUR_MAX */
1060 #endif /* MB_CUR_MAX */
1065 * Turn a UTF8 name uppercase
1067 * Returns an allocated uppercase name which has to be freed by caller
1068 * or NULL if there is an error (described by errno)
1071 char *ntfs_uppercase_mbs(const char *low
,
1072 const ntfschar
*upcase
, u32 upcase_size
)
1082 upp
= (char*)ntfs_malloc(3*size
+ 1);
1087 n
= utf8_to_unicode(&wc
, s
);
1089 if (wc
< upcase_size
)
1090 wc
= le16_to_cpu(upcase
[wc
]);
1093 else if (wc
< 0x800) {
1094 *t
++ = (0xc0 | ((wc
>> 6) & 0x3f));
1095 *t
++ = 0x80 | (wc
& 0x3f);
1096 } else if (wc
< 0x10000) {
1097 *t
++ = 0xe0 | (wc
>> 12);
1098 *t
++ = 0x80 | ((wc
>> 6) & 0x3f);
1099 *t
++ = 0x80 | (wc
& 0x3f);
1101 *t
++ = 0xf0 | ((wc
>> 18) & 7);
1102 *t
++ = 0x80 | ((wc
>> 12) & 63);
1103 *t
++ = 0x80 | ((wc
>> 6) & 0x3f);
1104 *t
++ = 0x80 | (wc
& 0x3f);
1120 * ntfs_upcase_table_build - build the default upcase table for NTFS
1121 * @uc: destination buffer where to store the built table
1122 * @uc_len: size of destination buffer in bytes
1124 * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1125 * stores it in the caller supplied buffer @uc of size @uc_len.
1127 * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1129 void ntfs_upcase_table_build(ntfschar
*uc
, u32 uc_len
)
1133 * This is the table as defined by Vista
1136 * "Start" is inclusive and "End" is exclusive, every value has the
1137 * value of "Add" added to it.
1139 static int uc_run_table
[][3] = { /* Start, End, Add */
1140 {0x0061, 0x007b, -32}, {0x00e0, 0x00f7, -32}, {0x00f8, 0x00ff, -32},
1141 {0x0256, 0x0258, -205}, {0x028a, 0x028c, -217}, {0x037b, 0x037e, 130},
1142 {0x03ac, 0x03ad, -38}, {0x03ad, 0x03b0, -37}, {0x03b1, 0x03c2, -32},
1143 {0x03c2, 0x03c3, -31}, {0x03c3, 0x03cc, -32}, {0x03cc, 0x03cd, -64},
1144 {0x03cd, 0x03cf, -63}, {0x0430, 0x0450, -32}, {0x0450, 0x0460, -80},
1145 {0x0561, 0x0587, -48}, {0x1f00, 0x1f08, 8}, {0x1f10, 0x1f16, 8},
1146 {0x1f20, 0x1f28, 8}, {0x1f30, 0x1f38, 8}, {0x1f40, 0x1f46, 8},
1147 {0x1f51, 0x1f52, 8}, {0x1f53, 0x1f54, 8}, {0x1f55, 0x1f56, 8},
1148 {0x1f57, 0x1f58, 8}, {0x1f60, 0x1f68, 8}, {0x1f70, 0x1f72, 74},
1149 {0x1f72, 0x1f76, 86}, {0x1f76, 0x1f78, 100}, {0x1f78, 0x1f7a, 128},
1150 {0x1f7a, 0x1f7c, 112}, {0x1f7c, 0x1f7e, 126}, {0x1f80, 0x1f88, 8},
1151 {0x1f90, 0x1f98, 8}, {0x1fa0, 0x1fa8, 8}, {0x1fb0, 0x1fb2, 8},
1152 {0x1fb3, 0x1fb4, 9}, {0x1fcc, 0x1fcd, -9}, {0x1fd0, 0x1fd2, 8},
1153 {0x1fe0, 0x1fe2, 8}, {0x1fe5, 0x1fe6, 7}, {0x1ffc, 0x1ffd, -9},
1154 {0x2170, 0x2180, -16}, {0x24d0, 0x24ea, -26}, {0x2c30, 0x2c5f, -48},
1155 {0x2d00, 0x2d26, -7264}, {0xff41, 0xff5b, -32}, {0}
1158 * "Start" is exclusive and "End" is inclusive, every second value is
1159 * decremented by one.
1161 static int uc_dup_table
[][2] = { /* Start, End */
1162 {0x0100, 0x012f}, {0x0132, 0x0137}, {0x0139, 0x0149}, {0x014a, 0x0178},
1163 {0x0179, 0x017e}, {0x01a0, 0x01a6}, {0x01b3, 0x01b7}, {0x01cd, 0x01dd},
1164 {0x01de, 0x01ef}, {0x01f4, 0x01f5}, {0x01f8, 0x01f9}, {0x01fa, 0x0220},
1165 {0x0222, 0x0234}, {0x023b, 0x023c}, {0x0241, 0x0242}, {0x0246, 0x024f},
1166 {0x03d8, 0x03ef}, {0x03f7, 0x03f8}, {0x03fa, 0x03fb}, {0x0460, 0x0481},
1167 {0x048a, 0x04bf}, {0x04c1, 0x04c4}, {0x04c5, 0x04c8}, {0x04c9, 0x04ce},
1168 {0x04ec, 0x04ed}, {0x04d0, 0x04eb}, {0x04ee, 0x04f5}, {0x04f6, 0x0513},
1169 {0x1e00, 0x1e95}, {0x1ea0, 0x1ef9}, {0x2183, 0x2184}, {0x2c60, 0x2c61},
1170 {0x2c67, 0x2c6c}, {0x2c75, 0x2c76}, {0x2c80, 0x2ce3}, {0}
1173 * Set the Unicode character at offset "Offset" to "Value". Note,
1174 * "Value" is host endian.
1176 static int uc_byte_table
[][2] = { /* Offset, Value */
1177 {0x00ff, 0x0178}, {0x0180, 0x0243}, {0x0183, 0x0182}, {0x0185, 0x0184},
1178 {0x0188, 0x0187}, {0x018c, 0x018b}, {0x0192, 0x0191}, {0x0195, 0x01f6},
1179 {0x0199, 0x0198}, {0x019a, 0x023d}, {0x019e, 0x0220}, {0x01a8, 0x01a7},
1180 {0x01ad, 0x01ac}, {0x01b0, 0x01af}, {0x01b9, 0x01b8}, {0x01bd, 0x01bc},
1181 {0x01bf, 0x01f7}, {0x01c6, 0x01c4}, {0x01c9, 0x01c7}, {0x01cc, 0x01ca},
1182 {0x01dd, 0x018e}, {0x01f3, 0x01f1}, {0x023a, 0x2c65}, {0x023e, 0x2c66},
1183 {0x0253, 0x0181}, {0x0254, 0x0186}, {0x0259, 0x018f}, {0x025b, 0x0190},
1184 {0x0260, 0x0193}, {0x0263, 0x0194}, {0x0268, 0x0197}, {0x0269, 0x0196},
1185 {0x026b, 0x2c62}, {0x026f, 0x019c}, {0x0272, 0x019d}, {0x0275, 0x019f},
1186 {0x027d, 0x2c64}, {0x0280, 0x01a6}, {0x0283, 0x01a9}, {0x0288, 0x01ae},
1187 {0x0289, 0x0244}, {0x028c, 0x0245}, {0x0292, 0x01b7}, {0x03f2, 0x03f9},
1188 {0x04cf, 0x04c0}, {0x1d7d, 0x2c63}, {0x214e, 0x2132}, {0}
1192 * This is the table as defined by Windows XP
1194 static int uc_run_table
[][3] = { /* Start, End, Add */
1195 {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74},
1196 {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86},
1197 {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1198 {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128},
1199 {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112},
1200 {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126},
1201 {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8},
1202 {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8},
1203 {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8},
1204 {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7},
1205 {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16},
1206 {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26},
1207 {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32},
1210 static int uc_dup_table
[][2] = { /* Start, End */
1211 {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1212 {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1213 {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1214 {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1215 {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1216 {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1219 static int uc_byte_table
[][2] = { /* Offset, Value */
1220 {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1221 {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1222 {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1223 {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1224 {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1225 {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1226 {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1227 {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1234 memset((char*)uc
, 0, uc_len
);
1238 for (i
= 0; (u32
)i
< uc_len
; i
++)
1239 uc
[i
] = cpu_to_le16(i
);
1240 for (r
= 0; uc_run_table
[r
][0]; r
++) {
1241 off
= uc_run_table
[r
][2];
1242 for (i
= uc_run_table
[r
][0]; i
< uc_run_table
[r
][1]; i
++)
1243 uc
[i
] = cpu_to_le16(i
+ off
);
1245 for (r
= 0; uc_dup_table
[r
][0]; r
++)
1246 for (i
= uc_dup_table
[r
][0]; i
< uc_dup_table
[r
][1]; i
+= 2)
1247 uc
[i
+ 1] = cpu_to_le16(i
);
1248 for (r
= 0; uc_byte_table
[r
][0]; r
++) {
1249 k
= uc_byte_table
[r
][1];
1250 uc
[uc_byte_table
[r
][0]] = cpu_to_le16(k
);
1255 * Allocate and build the default upcase table
1257 * Returns the number of entries
1261 #define UPCASE_LEN 65536 /* default number of entries in upcase */
1263 u32
ntfs_upcase_build_default(ntfschar
**upcase
)
1267 *upcase
= (ntfschar
*)ntfs_malloc(UPCASE_LEN
*2);
1269 ntfs_upcase_table_build(*upcase
, UPCASE_LEN
*2);
1270 upcase_len
= UPCASE_LEN
;
1272 return (upcase_len
);
1276 * Build a table for converting to lower case
1278 * This is only meaningful when there is a single lower case
1279 * character leading to an upper case one, and currently the
1280 * only exception is the greek letter sigma which has a single
1281 * upper case glyph (code U+03A3), but two lower case glyphs
1282 * (code U+03C3 and U+03C2, the latter to be used at the end
1283 * of a word). In the following implementation the upper case
1284 * sigma will be lowercased as U+03C3.
1287 ntfschar
*ntfs_locase_table_build(const ntfschar
*uc
, u32 uc_cnt
)
1293 lc
= (ntfschar
*)ntfs_malloc(uc_cnt
*sizeof(ntfschar
));
1295 for (i
=0; i
<uc_cnt
; i
++)
1296 lc
[i
] = cpu_to_le16(i
);
1297 for (i
=0; i
<uc_cnt
; i
++) {
1298 upp
= le16_to_cpu(uc
[i
]);
1299 if ((upp
!= i
) && (upp
< uc_cnt
))
1300 lc
[upp
] = cpu_to_le16(i
);
1303 ntfs_log_error("Could not build the locase table\n");
1308 * ntfs_str2ucs - convert a string to a valid NTFS file name
1310 * @len: length of output buffer in Unicode characters
1312 * Convert the input @s string into the corresponding little endian,
1313 * 2-byte Unicode string. The length of the converted string is less
1314 * or equal to the maximum length allowed by the NTFS format (255).
1316 * If @s is NULL then return AT_UNNAMED.
1318 * On success the function returns the Unicode string in an allocated
1319 * buffer and the caller is responsible to free it when it's not needed
1322 * On error NULL is returned and errno is set to the error code.
1324 ntfschar
*ntfs_str2ucs(const char *s
, int *len
)
1326 ntfschar
*ucs
= NULL
;
1328 if (s
&& ((*len
= ntfs_mbstoucs(s
, &ucs
)) == -1)) {
1329 ntfs_log_perror("Couldn't convert '%s' to Unicode", s
);
1332 if (*len
> NTFS_MAX_NAME_LEN
) {
1334 errno
= ENAMETOOLONG
;
1337 if (!ucs
|| !*len
) {
1345 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1346 * @ucs: input string to be freed
1348 * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1350 * Return value: none.
1352 void ntfs_ucsfree(ntfschar
*ucs
)
1354 if (ucs
&& (ucs
!= AT_UNNAMED
))
1359 * Check whether a name contains no chars forbidden
1360 * for DOS or Win32 use
1362 * If there is a bad char, errno is set to EINVAL
1365 BOOL
ntfs_forbidden_chars(const ntfschar
*name
, int len
)
1370 u32 mainset
= (1L << ('\"' - 0x20))
1371 | (1L << ('*' - 0x20))
1372 | (1L << ('/' - 0x20))
1373 | (1L << (':' - 0x20))
1374 | (1L << ('<' - 0x20))
1375 | (1L << ('>' - 0x20))
1376 | (1L << ('?' - 0x20));
1378 forbidden
= (len
== 0)
1379 || (le16_to_cpu(name
[len
-1]) == ' ')
1380 || (le16_to_cpu(name
[len
-1]) == '.');
1381 for (i
=0; i
<len
; i
++) {
1382 ch
= le16_to_cpu(name
[i
]);
1385 && ((1L << (ch
- 0x20)) & mainset
))
1396 * Check whether the same name can be used as a DOS and
1399 * The names must be the same, or the short name the uppercase
1400 * variant of the long name
1403 BOOL
ntfs_collapsible_chars(ntfs_volume
*vol
,
1404 const ntfschar
*shortname
, int shortlen
,
1405 const ntfschar
*longname
, int longlen
)
1412 collapsible
= shortlen
== longlen
;
1413 for (i
=0; collapsible
&& (i
<shortlen
); i
++) {
1414 ch
= le16_to_cpu(longname
[i
]);
1415 cs
= le16_to_cpu(shortname
[i
]);
1417 && ((ch
>= vol
->upcase_len
)
1418 || (cs
>= vol
->upcase_len
)
1419 || (vol
->upcase
[cs
] != vol
->upcase
[ch
])))
1420 collapsible
= FALSE
;
1422 return (collapsible
);
1426 * Define the character encoding to be used.
1427 * Use UTF-8 unless specified otherwise.
1430 int ntfs_set_char_encoding(const char *locale
)
1433 if (!locale
|| strstr(locale
,"utf8") || strstr(locale
,"UTF8")
1434 || strstr(locale
,"utf-8") || strstr(locale
,"UTF-8"))
1437 if (setlocale(LC_ALL
, locale
))
1440 ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1443 return 0; /* always successful */
1446 #if defined(__APPLE__) || defined(__DARWIN__)
1448 int ntfs_macosx_normalize_filenames(int normalize
) {
1449 #ifdef ENABLE_NFCONV
1450 if(normalize
== 0 || normalize
== 1) {
1451 nfconvert_utf8
= normalize
;
1458 #endif /* ENABLE_NFCONV */
1461 int ntfs_macosx_normalize_utf8(const char *utf8_string
, char **target
,
1463 #ifdef ENABLE_NFCONV
1464 /* For this code to compile, the CoreFoundation framework must be fed to the linker. */
1465 CFStringRef cfSourceString
;
1466 CFMutableStringRef cfMutableString
;
1467 CFRange rangeToProcess
;
1468 CFIndex requiredBufferLength
;
1469 char *result
= NULL
;
1470 int resultLength
= -1;
1472 /* Convert the UTF-8 string to a CFString. */
1473 cfSourceString
= CFStringCreateWithCString(kCFAllocatorDefault
, utf8_string
, kCFStringEncodingUTF8
);
1474 if(cfSourceString
== NULL
) {
1475 ntfs_log_error("CFStringCreateWithCString failed!\n");
1479 /* Create a mutable string from cfSourceString that we are free to modify. */
1480 cfMutableString
= CFStringCreateMutableCopy(kCFAllocatorDefault
, 0, cfSourceString
);
1481 CFRelease(cfSourceString
); /* End-of-life. */
1482 if(cfMutableString
== NULL
) {
1483 ntfs_log_error("CFStringCreateMutableCopy failed!\n");
1487 /* Normalize the mutable string to the desired normalization form. */
1488 CFStringNormalize(cfMutableString
, (composed
!= 0 ? kCFStringNormalizationFormC
: kCFStringNormalizationFormD
));
1490 /* Store the resulting string in a '\0'-terminated UTF-8 encoded char* buffer. */
1491 rangeToProcess
= CFRangeMake(0, CFStringGetLength(cfMutableString
));
1492 if(CFStringGetBytes(cfMutableString
, rangeToProcess
, kCFStringEncodingUTF8
, 0, false, NULL
, 0, &requiredBufferLength
) > 0) {
1493 resultLength
= sizeof(char)*(requiredBufferLength
+ 1);
1494 result
= ntfs_calloc(resultLength
);
1496 if(result
!= NULL
) {
1497 if(CFStringGetBytes(cfMutableString
, rangeToProcess
, kCFStringEncodingUTF8
,
1498 0, false, (UInt8
*)result
, resultLength
-1, &requiredBufferLength
) <= 0) {
1499 ntfs_log_error("Could not perform UTF-8 conversion of normalized CFMutableString.\n");
1505 ntfs_log_error("Could not perform a ntfs_calloc of %d bytes for char *result.\n", resultLength
);
1508 ntfs_log_error("Could not perform check for required length of UTF-8 conversion of normalized CFMutableString.\n");
1511 CFRelease(cfMutableString
);
1513 if(result
!= NULL
) {
1515 return resultLength
- 1;
1521 #endif /* ENABLE_NFCONV */
1523 #endif /* defined(__APPLE__) || defined(__DARWIN__) */