2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
4 * Copyright (c) 2000-2004 Anton Altaparmakov
5 * Copyright (c) 2002-2009 Szabolcs Szakacsits
6 * Copyright (c) 2008-2015 Jean-Pierre Andre
7 * Copyright (c) 2008 Bernhard Kaindl
9 * This program/include file is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as published
11 * by the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program/include file is distributed in the hope that it will be
15 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
16 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program (in the main directory of the NTFS-3G
21 * distribution in the file COPYING); if not, write to the Free Software
22 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
48 #if defined(__APPLE__) || defined(__DARWIN__)
50 #include <CoreFoundation/CoreFoundation.h>
51 #endif /* ENABLE_NFCONV */
52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
62 #ifndef ALLOW_BROKEN_UNICODE
63 /* Erik allowing broken UTF-16 surrogate pairs and U+FFFE and U+FFFF by default,
65 #define ALLOW_BROKEN_UNICODE 1
66 #endif /* !defined(ALLOW_BROKEN_UNICODE) */
72 * All these routines assume that the Unicode characters are in little endian
73 * encoding inside the strings!!!
76 static int use_utf8
= 1; /* use UTF-8 encoding for file names */
78 #if defined(__APPLE__) || defined(__DARWIN__)
81 * This variable controls whether or not automatic normalization form conversion
82 * should be performed when translating NTFS unicode file names to UTF-8.
83 * Defaults to on, but can be controlled from the outside using the function
84 * int ntfs_macosx_normalize_filenames(int normalize);
86 static int nfconvert_utf8
= 1;
87 #endif /* ENABLE_NFCONV */
88 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
91 * This is used by the name collation functions to quickly determine what
92 * characters are (in)valid.
95 static const u8 legal_ansi_char_array
[0x40] = {
96 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
97 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
99 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
100 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
102 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
103 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
105 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
106 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
111 * ntfs_names_are_equal - compare two Unicode names for equality
112 * @s1: name to compare to @s2
113 * @s1_len: length in Unicode characters of @s1
114 * @s2: name to compare to @s1
115 * @s2_len: length in Unicode characters of @s2
116 * @ic: ignore case bool
117 * @upcase: upcase table (only if @ic == IGNORE_CASE)
118 * @upcase_size: length in Unicode characters of @upcase (if present)
120 * Compare the names @s1 and @s2 and return TRUE (1) if the names are
121 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
122 * the @upcase table is used to perform a case insensitive comparison.
124 BOOL
ntfs_names_are_equal(const ntfschar
*s1
, size_t s1_len
,
125 const ntfschar
*s2
, size_t s2_len
,
126 const IGNORE_CASE_BOOL ic
,
127 const ntfschar
*upcase
, const u32 upcase_size
)
129 if (s1_len
!= s2_len
)
133 if (ic
== CASE_SENSITIVE
)
134 return ntfs_ucsncmp(s1
, s2
, s1_len
) ? FALSE
: TRUE
;
135 return ntfs_ucsncasecmp(s1
, s2
, s1_len
, upcase
, upcase_size
) ? FALSE
:
140 * ntfs_names_full_collate() fully collate two Unicode names
142 * @name1: first Unicode name to compare
143 * @name1_len: length of first Unicode name to compare
144 * @name2: second Unicode name to compare
145 * @name2_len: length of second Unicode name to compare
146 * @ic: either CASE_SENSITIVE or IGNORE_CASE (see below)
147 * @upcase: upcase table
148 * @upcase_len: upcase table size
150 * If @ic is CASE_SENSITIVE, then the names are compared primarily ignoring
151 * case, but if the names are equal ignoring case, then they are compared
152 * case-sensitively. As an example, "abc" would collate before "BCD" (since
153 * "abc" and "BCD" differ ignoring case and 'A' < 'B') but after "ABC" (since
154 * "ABC" and "abc" are equal ignoring case and 'A' < 'a'). This matches the
155 * collation order of filenames as indexed in NTFS directories.
157 * If @ic is IGNORE_CASE, then the names are only compared case-insensitively
158 * and are considered to match if and only if they are equal ignoring case.
161 * -1 if the first name collates before the second one,
162 * 0 if the names match, or
163 * 1 if the second name collates before the first one
165 int ntfs_names_full_collate(const ntfschar
*name1
, const u32 name1_len
,
166 const ntfschar
*name2
, const u32 name2_len
,
167 const IGNORE_CASE_BOOL ic
, const ntfschar
*upcase
,
168 const u32 upcase_len
)
175 if (!name1
|| !name2
|| !upcase
|| !upcase_len
) {
176 ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
180 cnt
= min(name1_len
, name2_len
);
182 if (ic
== CASE_SENSITIVE
) {
183 while (--cnt
&& (*name1
== *name2
)) {
187 u1
= c1
= le16_to_cpu(*name1
);
188 u2
= c2
= le16_to_cpu(*name2
);
190 u1
= le16_to_cpu(upcase
[u1
]);
192 u2
= le16_to_cpu(upcase
[u2
]);
193 if ((u1
== u2
) && cnt
)
196 u1
= le16_to_cpu(*name1
);
198 u2
= le16_to_cpu(*name2
);
200 u1
= le16_to_cpu(upcase
[u1
]);
202 u2
= le16_to_cpu(upcase
[u2
]);
203 } while ((u1
== u2
) && --cnt
);
208 if (name1_len
< name2_len
)
210 if (name1_len
> name2_len
)
218 u1
= le16_to_cpu(*name1
);
220 u2
= le16_to_cpu(*name2
);
223 u1
= le16_to_cpu(upcase
[u1
]);
225 u2
= le16_to_cpu(upcase
[u2
]);
226 } while ((u1
== u2
) && --cnt
);
231 if (name1_len
< name2_len
)
233 if (name1_len
> name2_len
)
237 if (name1_len
< name2_len
)
239 if (name1_len
> name2_len
)
246 * ntfs_ucsncmp - compare two little endian Unicode strings
249 * @n: maximum unicode characters to compare
251 * Compare the first @n characters of the Unicode strings @s1 and @s2.
252 * The strings are in little endian format and the appropriate le16_to_cpu()
253 * conversion is performed on non-little endian machines.
255 * The function returns an integer less than, equal to, or greater than zero
256 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
257 * to be less than, to match, or be greater than @s2.
259 int ntfs_ucsncmp(const ntfschar
*s1
, const ntfschar
*s2
, size_t n
)
266 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
270 for (i
= 0; i
< n
; ++i
) {
271 c1
= le16_to_cpu(s1
[i
]);
272 c2
= le16_to_cpu(s2
[i
]);
284 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
287 * @n: maximum unicode characters to compare
288 * @upcase: upcase table
289 * @upcase_size: upcase table size in Unicode characters
291 * Compare the first @n characters of the Unicode strings @s1 and @s2,
292 * ignoring case. The strings are in little endian format and the appropriate
293 * le16_to_cpu() conversion is performed on non-little endian machines.
295 * Each character is uppercased using the @upcase table before the comparison.
297 * The function returns an integer less than, equal to, or greater than zero
298 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
299 * to be less than, to match, or be greater than @s2.
301 int ntfs_ucsncasecmp(const ntfschar
*s1
, const ntfschar
*s2
, size_t n
,
302 const ntfschar
*upcase
, const u32 upcase_size
)
308 if (!s1
|| !s2
|| !upcase
) {
309 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
313 for (i
= 0; i
< n
; ++i
) {
314 if ((c1
= le16_to_cpu(s1
[i
])) < upcase_size
)
315 c1
= le16_to_cpu(upcase
[c1
]);
316 if ((c2
= le16_to_cpu(s2
[i
])) < upcase_size
)
317 c2
= le16_to_cpu(upcase
[c2
]);
329 * ntfs_ucsnlen - determine the length of a little endian Unicode string
330 * @s: pointer to Unicode string
331 * @maxlen: maximum length of string @s
333 * Return the number of Unicode characters in the little endian Unicode
334 * string @s up to a maximum of maxlen Unicode characters, not including
335 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
336 * and @s + @maxlen, @maxlen is returned.
338 * This function never looks beyond @s + @maxlen.
340 u32
ntfs_ucsnlen(const ntfschar
*s
, u32 maxlen
)
344 for (i
= 0; i
< maxlen
; i
++) {
345 if (!le16_to_cpu(s
[i
]))
352 * ntfs_ucsndup - duplicate little endian Unicode string
353 * @s: pointer to Unicode string
354 * @maxlen: maximum length of string @s
356 * Return a pointer to a new little endian Unicode string which is a duplicate
357 * of the string s. Memory for the new string is obtained with ntfs_malloc(3),
358 * and can be freed with free(3).
360 * A maximum of @maxlen Unicode characters are copied and a terminating
361 * (ntfschar)'\0' little endian Unicode character is added.
363 * This function never looks beyond @s + @maxlen.
365 * Return a pointer to the new little endian Unicode string on success and NULL
366 * on failure with errno set to the error code.
368 ntfschar
*ntfs_ucsndup(const ntfschar
*s
, u32 maxlen
)
373 len
= ntfs_ucsnlen(s
, maxlen
);
374 dst
= ntfs_malloc((len
+ 1) * sizeof(ntfschar
));
376 memcpy(dst
, s
, len
* sizeof(ntfschar
));
377 dst
[len
] = const_cpu_to_le16(L
'\0');
383 * ntfs_name_upcase - Map a Unicode name to its uppercase equivalent
393 void ntfs_name_upcase(ntfschar
*name
, u32 name_len
, const ntfschar
*upcase
,
394 const u32 upcase_len
)
399 for (i
= 0; i
< name_len
; i
++)
400 if ((u
= le16_to_cpu(name
[i
])) < upcase_len
)
405 * ntfs_name_locase - Map a Unicode name to its lowercase equivalent
407 void ntfs_name_locase(ntfschar
*name
, u32 name_len
, const ntfschar
*locase
,
408 const u32 locase_len
)
414 for (i
= 0; i
< name_len
; i
++)
415 if ((u
= le16_to_cpu(name
[i
])) < locase_len
)
420 * ntfs_file_value_upcase - Convert a filename to upper case
429 void ntfs_file_value_upcase(FILE_NAME_ATTR
*file_name_attr
,
430 const ntfschar
*upcase
, const u32 upcase_len
)
432 ntfs_name_upcase((ntfschar
*)&file_name_attr
->file_name
,
433 file_name_attr
->file_name_length
, upcase
, upcase_len
);
437 NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
438 for now]) for path names, but the Unicode code points need to be
439 converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
440 glibc does this even without a locale in a hard-coded fashion as that
441 appears to be easy because the low 7-bit ASCII range appears to be
442 available in all charsets, but it does not convert anything if
443 there was some error with the locale setup or if none is set up, as
444 when mount is called during early boot, where we (by policy) do
445 not use locales (and they may not be available if /usr is not yet mounted),
446 so this patch fixes the resulting issues for systems which use
447 UTF-8 and for others, specifying the locale in fstab brings them
448 the encoding which they want.
450 If no locale is defined or there was a problem with setting one
451 up and whenever nl_langinfo(CODESET) returns a string starting with
452 "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
453 the bug where NTFS-3G does not show any path names which include
454 international characters!!! (and also fails on creating them) as a result.
456 Author: Bernhard Kaindl <bk@suse.de>
457 Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
461 * Return the number of bytes in UTF-8 needed (without the terminating null) to
462 * store the given UTF-16LE string.
464 * On error, -1 is returned, and errno is set to the error code. The following
465 * error codes can be expected:
466 * EILSEQ The input string is not valid UTF-16LE (only possible
467 * if compiled without ALLOW_BROKEN_UNICODE).
468 * ENAMETOOLONG The length of the UTF-8 string in bytes (without the
469 * terminating null) would exceed @outs_len.
471 static int utf16_to_utf8_size(const ntfschar
*ins
, const int ins_len
, int outs_len
)
478 for (i
= 0; i
< ins_len
&& ins
[i
] && count
<= outs_len
; i
++) {
479 unsigned short c
= le16_to_cpu(ins
[i
]);
481 if ((c
>= 0xdc00) && (c
< 0xe000)) {
485 #if ALLOW_BROKEN_UNICODE
486 /* The first UTF-16 unit of a surrogate pair has
487 * a value between 0xd800 and 0xdc00. It can be
488 * encoded as an individual UTF-8 sequence if we
489 * cannot combine it with the next UTF-16 unit
490 * as a surrogate pair. */
498 #endif /* ALLOW_BROKEN_UNICODE */
509 #if ALLOW_BROKEN_UNICODE
512 else if (c
>= 0xe000)
514 else if ((c
>= 0xe000) && (c
< 0xfffe))
515 #endif /* ALLOW_BROKEN_UNICODE */
521 if (surrog
&& count
<= outs_len
) {
522 #if ALLOW_BROKEN_UNICODE
523 count
+= 3; /* ending with a single surrogate */
526 #endif /* ALLOW_BROKEN_UNICODE */
529 if (count
> outs_len
) {
530 errno
= ENAMETOOLONG
;
543 * ntfs_utf16_to_utf8 - convert a little endian UTF-16LE string to a UTF-8 string
544 * @ins: input utf16 string buffer
545 * @ins_len: length of input string in utf16 characters
546 * @outs: on return contains the (allocated) output multibyte string
547 * @outs_len: length of output buffer in bytes (ignored if *@outs is NULL)
549 * Return -1 with errno set if the string has an invalid byte sequence or is too long.
551 static int ntfs_utf16_to_utf8(const ntfschar
*ins
, const int ins_len
,
552 char **outs
, int outs_len
)
554 #if defined(__APPLE__) || defined(__DARWIN__)
556 char *original_outs_value
= *outs
;
557 int original_outs_len
= outs_len
;
558 #endif /* ENABLE_NFCONV */
559 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
562 int i
, size
, ret
= -1;
567 /* If no output buffer was provided, we will allocate one and
568 * limit its length to PATH_MAX. Note: we follow the standard
569 * convention of PATH_MAX including the terminating null. */
573 /* The size *with* the terminating null is limited to @outs_len,
574 * so the size *without* the terminating null is limited to one less. */
575 size
= utf16_to_utf8_size(ins
, ins_len
, outs_len
- 1);
582 *outs
= ntfs_malloc(outs_len
);
589 for (i
= 0; i
< ins_len
&& ins
[i
]; i
++) {
590 unsigned short c
= le16_to_cpu(ins
[i
]);
591 /* size not double-checked */
593 if ((c
>= 0xdc00) && (c
< 0xe000)) {
594 *t
++ = 0xf0 + (((halfpair
+ 64) >> 8) & 7);
595 *t
++ = 0x80 + (((halfpair
+ 64) >> 2) & 63);
596 *t
++ = 0x80 + ((c
>> 6) & 15) + ((halfpair
& 3) << 4);
597 *t
++ = 0x80 + (c
& 63);
600 #if ALLOW_BROKEN_UNICODE
601 /* The first UTF-16 unit of a surrogate pair has
602 * a value between 0xd800 and 0xdc00. It can be
603 * encoded as an individual UTF-8 sequence if we
604 * cannot combine it with the next UTF-16 unit
605 * as a surrogate pair. */
606 *t
++ = 0xe0 | (halfpair
>> 12);
607 *t
++ = 0x80 | ((halfpair
>> 6) & 0x3f);
608 *t
++ = 0x80 | (halfpair
& 0x3f);
615 #endif /* ALLOW_BROKEN_UNICODE */
617 } else if (c
< 0x80) {
621 *t
++ = (0xc0 | ((c
>> 6) & 0x3f));
622 *t
++ = 0x80 | (c
& 0x3f);
623 } else if (c
< 0xd800) {
624 *t
++ = 0xe0 | (c
>> 12);
625 *t
++ = 0x80 | ((c
>> 6) & 0x3f);
626 *t
++ = 0x80 | (c
& 0x3f);
627 } else if (c
< 0xdc00)
629 #if ALLOW_BROKEN_UNICODE
630 else if (c
< 0xe000) {
631 *t
++ = 0xe0 | (c
>> 12);
632 *t
++ = 0x80 | ((c
>> 6) & 0x3f);
633 *t
++ = 0x80 | (c
& 0x3f);
635 #endif /* ALLOW_BROKEN_UNICODE */
636 else if (c
>= 0xe000) {
637 *t
++ = 0xe0 | (c
>> 12);
638 *t
++ = 0x80 | ((c
>> 6) & 0x3f);
639 *t
++ = 0x80 | (c
& 0x3f);
644 #if ALLOW_BROKEN_UNICODE
645 if (halfpair
) { /* ending with a single surrogate */
646 *t
++ = 0xe0 | (halfpair
>> 12);
647 *t
++ = 0x80 | ((halfpair
>> 6) & 0x3f);
648 *t
++ = 0x80 | (halfpair
& 0x3f);
650 #endif /* ALLOW_BROKEN_UNICODE */
653 #if defined(__APPLE__) || defined(__DARWIN__)
655 if(nfconvert_utf8
&& (t
- *outs
) > 0) {
656 char *new_outs
= NULL
;
657 int new_outs_len
= ntfs_macosx_normalize_utf8(*outs
, &new_outs
, 0); // Normalize to decomposed form
658 if(new_outs_len
>= 0 && new_outs
!= NULL
) {
659 if(original_outs_value
!= *outs
) {
660 // We have allocated outs ourselves.
663 t
= *outs
+ new_outs_len
;
666 // We need to copy new_outs into the fixed outs buffer.
667 memset(*outs
, 0, original_outs_len
);
668 strncpy(*outs
, new_outs
, original_outs_len
-1);
669 t
= *outs
+ original_outs_len
;
674 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs
);
675 ntfs_log_error(" new_outs=0x%p\n", new_outs
);
676 ntfs_log_error(" new_outs_len=%d\n", new_outs_len
);
679 #endif /* ENABLE_NFCONV */
680 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
691 * Return the number of 16-bit elements in UTF-16LE needed
692 * (without the terminating null) to store given UTF-8 string.
694 * Return -1 with errno set if it's longer than PATH_MAX or string is invalid.
696 * Note: This does not check whether the input sequence is a valid utf8 string,
697 * and should be used only in context where such check is made!
699 static int utf8_to_utf16_size(const char *s
)
705 while ((byte
= *((const unsigned char *)s
++))) {
706 if (++count
>= PATH_MAX
)
725 if (++count
>= PATH_MAX
)
734 errno
= ENAMETOOLONG
;
738 * This converts one UTF-8 sequence to a cpu-endian Unicode value
739 * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
741 * Return the number of used utf8 bytes or -1 with errno set
742 * if sequence is invalid.
744 static int utf8_to_unicode(u32
*wc
, const char *s
)
746 unsigned int byte
= *((const unsigned char *)s
);
752 } else if (byte
< 0x80) {
756 } else if (byte
< 0xc2) {
758 } else if (byte
< 0xE0) {
759 if ((s
[1] & 0xC0) == 0x80) {
760 *wc
= ((u32
)(byte
& 0x1F) << 6)
761 | ((u32
)(s
[1] & 0x3F));
766 } else if (byte
< 0xF0) {
767 if (((s
[1] & 0xC0) == 0x80) && ((s
[2] & 0xC0) == 0x80)) {
768 *wc
= ((u32
)(byte
& 0x0F) << 12)
769 | ((u32
)(s
[1] & 0x3F) << 6)
770 | ((u32
)(s
[2] & 0x3F));
771 /* Check valid ranges */
772 #if ALLOW_BROKEN_UNICODE
773 if (((*wc
>= 0x800) && (*wc
<= 0xD7FF))
774 || ((*wc
>= 0xD800) && (*wc
<= 0xDFFF))
775 || ((*wc
>= 0xe000) && (*wc
<= 0xFFFF)))
778 if (((*wc
>= 0x800) && (*wc
<= 0xD7FF))
779 || ((*wc
>= 0xe000) && (*wc
<= 0xFFFD)))
781 #endif /* ALLOW_BROKEN_UNICODE */
785 } else if (byte
< 0xF5) {
786 if (((s
[1] & 0xC0) == 0x80) && ((s
[2] & 0xC0) == 0x80)
787 && ((s
[3] & 0xC0) == 0x80)) {
788 *wc
= ((u32
)(byte
& 0x07) << 18)
789 | ((u32
)(s
[1] & 0x3F) << 12)
790 | ((u32
)(s
[2] & 0x3F) << 6)
791 | ((u32
)(s
[3] & 0x3F));
792 /* Check valid ranges */
793 if ((*wc
<= 0x10ffff) && (*wc
>= 0x10000))
804 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
805 * @ins: input multibyte string buffer
806 * @outs: on return contains the (allocated) output utf16 string
807 * @outs_len: length of output buffer in utf16 characters
809 * Return -1 with errno set.
811 static int ntfs_utf8_to_utf16(const char *ins
, ntfschar
**outs
)
813 #if defined(__APPLE__) || defined(__DARWIN__)
815 char *new_ins
= NULL
;
818 new_ins_len
= ntfs_macosx_normalize_utf8(ins
, &new_ins
, 1); // Normalize to composed form
822 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins
);
824 #endif /* ENABLE_NFCONV */
825 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
830 int shorts
, ret
= -1;
832 shorts
= utf8_to_utf16_size(ins
);
838 *outs
= ntfs_malloc((shorts
+ 1) * sizeof(ntfschar
));
847 int m
= utf8_to_unicode(&wc
, t
);
850 /* do not leave space allocated if failed */
853 *outs
= (ntfschar
*)NULL
;
857 *outpos
++ = const_cpu_to_le16(0);
861 *outpos
++ = cpu_to_le16(wc
);
864 *outpos
++ = cpu_to_le16((wc
>> 10) + 0xd800);
865 *outpos
++ = cpu_to_le16((wc
& 0x3ff) + 0xdc00);
870 ret
= --outpos
- *outs
;
872 #if defined(__APPLE__) || defined(__DARWIN__)
876 #endif /* ENABLE_NFCONV */
877 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
882 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
883 * @ins: input Unicode string buffer
884 * @ins_len: length of input string in Unicode characters
885 * @outs: on return contains the (allocated) output multibyte string
886 * @outs_len: length of output buffer in bytes (ignored if *@outs is NULL)
888 * Convert the input little endian, 2-byte Unicode string @ins, of length
889 * @ins_len into the multibyte string format dictated by the current locale.
891 * If *@outs is NULL, the function allocates the string and the caller is
892 * responsible for calling free(*@outs); when finished with it.
894 * On success the function returns the number of bytes written to the output
895 * string *@outs (>= 0), not counting the terminating NULL byte. If the output
896 * string buffer was allocated, *@outs is set to it.
898 * On error, -1 is returned, and errno is set to the error code. The following
899 * error codes can be expected:
900 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
901 * EILSEQ The input string cannot be represented as a multibyte
902 * sequence according to the current locale.
903 * ENAMETOOLONG Destination buffer is too small for input string.
904 * ENOMEM Not enough memory to allocate destination buffer.
906 int ntfs_ucstombs(const ntfschar
*ins
, const int ins_len
, char **outs
,
918 #endif /* MB_CUR_MAX */
926 if (mbs
&& !mbs_len
) {
927 errno
= ENAMETOOLONG
;
931 return ntfs_utf16_to_utf8(ins
, ins_len
, outs
, outs_len
);
934 mbs_len
= (ins_len
+ 1) * MB_CUR_MAX
;
935 mbs
= ntfs_malloc(mbs_len
);
940 memset(&mbstate
, 0, sizeof(mbstate
));
944 for (i
= o
= 0; i
< ins_len
; i
++) {
945 /* Reallocate memory if necessary or abort. */
946 if ((int)(o
+ MB_CUR_MAX
) > mbs_len
) {
949 errno
= ENAMETOOLONG
;
952 tc
= ntfs_malloc((mbs_len
+ 64) & ~63);
955 memcpy(tc
, mbs
, mbs_len
);
956 mbs_len
= (mbs_len
+ 64) & ~63;
960 /* Convert the LE Unicode character to a CPU wide character. */
961 wc
= (wchar_t)le16_to_cpu(ins
[i
]);
964 /* Convert the CPU endian wide character to multibyte. */
966 cnt
= wcrtomb(mbs
+ o
, wc
, &mbstate
);
968 cnt
= wctomb(mbs
+ o
, wc
);
973 ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt
);
980 /* Make sure we are back in the initial state. */
981 if (!mbsinit(&mbstate
)) {
982 ntfs_log_debug("Eeek. mbstate not in initial state!\n");
987 /* Now write the NULL character. */
998 #else /* MB_CUR_MAX */
1000 #endif /* MB_CUR_MAX */
1005 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
1006 * @ins: input multibyte string buffer
1007 * @outs: on return contains the (allocated) output Unicode string
1009 * Convert the input multibyte string @ins, from the current locale into the
1010 * corresponding little endian, 2-byte Unicode string.
1012 * The function allocates the string and the caller is responsible for calling
1013 * free(*@outs); when finished with it.
1015 * On success the function returns the number of Unicode characters written to
1016 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
1019 * On error, -1 is returned, and errno is set to the error code. The following
1020 * error codes can be expected:
1021 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
1022 * EILSEQ The input string cannot be represented as a Unicode
1023 * string according to the current locale.
1024 * ENAMETOOLONG Destination buffer is too small for input string.
1025 * ENOMEM Not enough memory to allocate destination buffer.
1027 int ntfs_mbstoucs(const char *ins
, ntfschar
**outs
)
1033 int i
, o
, cnt
, ins_len
, ucs_len
, ins_size
;
1037 #endif /* MB_CUR_MAX */
1039 if (!ins
|| !outs
) {
1045 return ntfs_utf8_to_utf16(ins
, outs
);
1048 /* Determine the size of the multi-byte string in bytes. */
1049 ins_size
= strlen(ins
);
1050 /* Determine the length of the multi-byte string. */
1052 #if defined(HAVE_MBSINIT)
1053 memset(&mbstate
, 0, sizeof(mbstate
));
1054 ins_len
= mbsrtowcs(NULL
, (const char **)&s
, 0, &mbstate
);
1056 if (!ins_len
&& *ins
) {
1057 /* Older Cygwin had broken mbsrtowcs() implementation. */
1058 ins_len
= strlen(ins
);
1061 #elif !defined(DJGPP)
1062 ins_len
= mbstowcs(NULL
, s
, 0);
1064 /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
1065 ins_len
= strlen(ins
);
1070 if ((s
!= ins
) || !mbsinit(&mbstate
)) {
1077 /* Add the NULL terminator. */
1080 ucs
= ntfs_malloc(ucs_len
* sizeof(ntfschar
));
1084 memset(&mbstate
, 0, sizeof(mbstate
));
1086 mbtowc(NULL
, NULL
, 0);
1088 for (i
= o
= cnt
= 0; i
< ins_size
; i
+= cnt
, o
++) {
1089 /* Reallocate memory if necessary. */
1092 ucs_len
= (ucs_len
* sizeof(ntfschar
) + 64) & ~63;
1093 tc
= realloc(ucs
, ucs_len
);
1097 ucs_len
/= sizeof(ntfschar
);
1099 /* Convert the multibyte character to a wide character. */
1101 cnt
= mbrtowc(&wc
, ins
+ i
, ins_size
- i
, &mbstate
);
1103 cnt
= mbtowc(&wc
, ins
+ i
, ins_size
- i
);
1110 ntfs_log_trace("Eeek. cnt = %i\n", cnt
);
1114 /* Make sure we are not overflowing the NTFS Unicode set. */
1115 if ((unsigned long)wc
>= (unsigned long)(1 <<
1116 (8 * sizeof(ntfschar
)))) {
1120 /* Convert the CPU wide character to a LE Unicode character. */
1121 ucs
[o
] = cpu_to_le16(wc
);
1124 /* Make sure we are back in the initial state. */
1125 if (!mbsinit(&mbstate
)) {
1126 ntfs_log_trace("Eeek. mbstate not in initial state!\n");
1131 /* Now write the NULL character. */
1132 ucs
[o
] = const_cpu_to_le16(L
'\0');
1137 #else /* MB_CUR_MAX */
1139 #endif /* MB_CUR_MAX */
1144 * Turn a UTF8 name uppercase
1146 * Returns an allocated uppercase name which has to be freed by caller
1147 * or NULL if there is an error (described by errno)
1150 char *ntfs_uppercase_mbs(const char *low
,
1151 const ntfschar
*upcase
, u32 upcase_size
)
1161 upp
= (char*)ntfs_malloc(3*size
+ 1);
1166 n
= utf8_to_unicode(&wc
, s
);
1168 if (wc
< upcase_size
)
1169 wc
= le16_to_cpu(upcase
[wc
]);
1172 else if (wc
< 0x800) {
1173 *t
++ = (0xc0 | ((wc
>> 6) & 0x3f));
1174 *t
++ = 0x80 | (wc
& 0x3f);
1175 } else if (wc
< 0x10000) {
1176 *t
++ = 0xe0 | (wc
>> 12);
1177 *t
++ = 0x80 | ((wc
>> 6) & 0x3f);
1178 *t
++ = 0x80 | (wc
& 0x3f);
1180 *t
++ = 0xf0 | ((wc
>> 18) & 7);
1181 *t
++ = 0x80 | ((wc
>> 12) & 63);
1182 *t
++ = 0x80 | ((wc
>> 6) & 0x3f);
1183 *t
++ = 0x80 | (wc
& 0x3f);
1199 * ntfs_upcase_table_build - build the default upcase table for NTFS
1200 * @uc: destination buffer where to store the built table
1201 * @uc_len: size of destination buffer in bytes
1203 * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1204 * stores it in the caller supplied buffer @uc of size @uc_len.
1206 * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1208 void ntfs_upcase_table_build(ntfschar
*uc
, u32 uc_len
)
1210 struct NEWUPPERCASE
{
1211 unsigned short first
;
1212 unsigned short last
;
1215 unsigned char osmajor
;
1216 unsigned char osminor
;
1220 * This is the table as defined by Windows XP
1222 static int uc_run_table
[][3] = { /* Start, End, Add */
1223 {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74},
1224 {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86},
1225 {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1226 {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128},
1227 {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112},
1228 {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126},
1229 {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8},
1230 {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8},
1231 {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8},
1232 {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7},
1233 {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16},
1234 {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26},
1235 {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32},
1238 static int uc_dup_table
[][2] = { /* Start, End */
1239 {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1240 {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1241 {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1242 {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1243 {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1244 {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1247 static int uc_byte_table
[][2] = { /* Offset, Value */
1248 {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1249 {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1250 {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1251 {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1252 {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1253 {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1254 {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1255 {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1260 * Changes which were applied to later Windows versions
1262 * md5 for $UpCase from Winxp : 6fa3db2468275286210751e869d36373
1263 * Vista : 2f03b5a69d486ff3864cecbd07f24440
1264 * Win8 : 7ff498a44e45e77374cc7c962b1b92f2
1266 static const struct NEWUPPERCASE newuppercase
[] = {
1267 /* from Windows 6.0 (Vista) */
1268 { 0x37b, 0x37d, 0x82, 1, 6, 0 },
1269 { 0x1f80, 0x1f87, 0x8, 1, 6, 0 },
1270 { 0x1f90, 0x1f97, 0x8, 1, 6, 0 },
1271 { 0x1fa0, 0x1fa7, 0x8, 1, 6, 0 },
1272 { 0x2c30, 0x2c5e, -0x30, 1, 6, 0 },
1273 { 0x2d00, 0x2d25, -0x1c60, 1, 6, 0 },
1274 { 0x2c68, 0x2c6c, -0x1, 2, 6, 0 },
1275 { 0x219, 0x21f, -0x1, 2, 6, 0 },
1276 { 0x223, 0x233, -0x1, 2, 6, 0 },
1277 { 0x247, 0x24f, -0x1, 2, 6, 0 },
1278 { 0x3d9, 0x3e1, -0x1, 2, 6, 0 },
1279 { 0x48b, 0x48f, -0x1, 2, 6, 0 },
1280 { 0x4fb, 0x513, -0x1, 2, 6, 0 },
1281 { 0x2c81, 0x2ce3, -0x1, 2, 6, 0 },
1282 { 0x3f8, 0x3fb, -0x1, 3, 6, 0 },
1283 { 0x4c6, 0x4ce, -0x1, 4, 6, 0 },
1284 { 0x23c, 0x242, -0x1, 6, 6, 0 },
1285 { 0x4ed, 0x4f7, -0x1, 10, 6, 0 },
1286 { 0x450, 0x45d, -0x50, 13, 6, 0 },
1287 { 0x2c61, 0x2c76, -0x1, 21, 6, 0 },
1288 { 0x1fcc, 0x1ffc, -0x9, 48, 6, 0 },
1289 { 0x180, 0x180, 0xc3, 1, 6, 0 },
1290 { 0x195, 0x195, 0x61, 1, 6, 0 },
1291 { 0x19a, 0x19a, 0xa3, 1, 6, 0 },
1292 { 0x19e, 0x19e, 0x82, 1, 6, 0 },
1293 { 0x1bf, 0x1bf, 0x38, 1, 6, 0 },
1294 { 0x1f9, 0x1f9, -0x1, 1, 6, 0 },
1295 { 0x23a, 0x23a, 0x2a2b, 1, 6, 0 },
1296 { 0x23e, 0x23e, 0x2a28, 1, 6, 0 },
1297 { 0x26b, 0x26b, 0x29f7, 1, 6, 0 },
1298 { 0x27d, 0x27d, 0x29e7, 1, 6, 0 },
1299 { 0x280, 0x280, -0xda, 1, 6, 0 },
1300 { 0x289, 0x289, -0x45, 1, 6, 0 },
1301 { 0x28c, 0x28c, -0x47, 1, 6, 0 },
1302 { 0x3f2, 0x3f2, 0x7, 1, 6, 0 },
1303 { 0x4cf, 0x4cf, -0xf, 1, 6, 0 },
1304 { 0x1d7d, 0x1d7d, 0xee6, 1, 6, 0 },
1305 { 0x1fb3, 0x1fb3, 0x9, 1, 6, 0 },
1306 { 0x214e, 0x214e, -0x1c, 1, 6, 0 },
1307 { 0x2184, 0x2184, -0x1, 1, 6, 0 },
1308 /* from Windows 6.1 (Win7) */
1309 { 0x23a, 0x23e, 0x0, 4, 6, 1 },
1310 { 0x250, 0x250, 0x2a1f, 2, 6, 1 },
1311 { 0x251, 0x251, 0x2a1c, 2, 6, 1 },
1312 { 0x271, 0x271, 0x29fd, 2, 6, 1 },
1313 { 0x371, 0x373, -0x1, 2, 6, 1 },
1314 { 0x377, 0x377, -0x1, 2, 6, 1 },
1315 { 0x3c2, 0x3c2, 0x0, 2, 6, 1 },
1316 { 0x3d7, 0x3d7, -0x8, 2, 6, 1 },
1317 { 0x515, 0x523, -0x1, 2, 6, 1 },
1318 /* below, -0x75fc stands for 0x8a04 and truncation */
1319 { 0x1d79, 0x1d79, -0x75fc, 2, 6, 1 },
1320 { 0x1efb, 0x1eff, -0x1, 2, 6, 1 },
1321 { 0x1fc3, 0x1ff3, 0x9, 48, 6, 1 },
1322 { 0x1fcc, 0x1ffc, 0x0, 48, 6, 1 },
1323 { 0x2c65, 0x2c65, -0x2a2b, 2, 6, 1 },
1324 { 0x2c66, 0x2c66, -0x2a28, 2, 6, 1 },
1325 { 0x2c73, 0x2c73, -0x1, 2, 6, 1 },
1326 { 0xa641, 0xa65f, -0x1, 2, 6, 1 },
1327 { 0xa663, 0xa66d, -0x1, 2, 6, 1 },
1328 { 0xa681, 0xa697, -0x1, 2, 6, 1 },
1329 { 0xa723, 0xa72f, -0x1, 2, 6, 1 },
1330 { 0xa733, 0xa76f, -0x1, 2, 6, 1 },
1331 { 0xa77a, 0xa77c, -0x1, 2, 6, 1 },
1332 { 0xa77f, 0xa787, -0x1, 2, 6, 1 },
1333 { 0xa78c, 0xa78c, -0x1, 2, 6, 1 },
1340 const struct NEWUPPERCASE
*puc
;
1342 memset((char*)uc
, 0, uc_len
);
1346 for (i
= 0; (u32
)i
< uc_len
; i
++)
1347 uc
[i
] = cpu_to_le16(i
);
1348 for (r
= 0; uc_run_table
[r
][0]; r
++) {
1349 off
= uc_run_table
[r
][2];
1350 for (i
= uc_run_table
[r
][0]; i
< uc_run_table
[r
][1]; i
++)
1351 uc
[i
] = cpu_to_le16(i
+ off
);
1353 for (r
= 0; uc_dup_table
[r
][0]; r
++)
1354 for (i
= uc_dup_table
[r
][0]; i
< uc_dup_table
[r
][1]; i
+= 2)
1355 uc
[i
+ 1] = cpu_to_le16(i
);
1356 for (r
= 0; uc_byte_table
[r
][0]; r
++) {
1357 k
= uc_byte_table
[r
][1];
1358 uc
[uc_byte_table
[r
][0]] = cpu_to_le16(k
);
1360 for (r
=0; newuppercase
[r
].first
; r
++) {
1361 puc
= &newuppercase
[r
];
1362 if ((puc
->osmajor
< UPCASE_MAJOR
)
1363 || ((puc
->osmajor
== UPCASE_MAJOR
)
1364 && (puc
->osminor
<= UPCASE_MINOR
))) {
1366 for (i
= puc
->first
; i
<= puc
->last
; i
+= puc
->step
)
1367 uc
[i
] = cpu_to_le16(i
+ off
);
1373 * Allocate and build the default upcase table
1375 * Returns the number of entries
1379 #define UPCASE_LEN 65536 /* default number of entries in upcase */
1381 u32
ntfs_upcase_build_default(ntfschar
**upcase
)
1385 *upcase
= (ntfschar
*)ntfs_malloc(UPCASE_LEN
*2);
1387 ntfs_upcase_table_build(*upcase
, UPCASE_LEN
*2);
1388 upcase_len
= UPCASE_LEN
;
1390 return (upcase_len
);
1394 * Build a table for converting to lower case
1396 * This is only meaningful when there is a single lower case
1397 * character leading to an upper case one, and currently the
1398 * only exception is the greek letter sigma which has a single
1399 * upper case glyph (code U+03A3), but two lower case glyphs
1400 * (code U+03C3 and U+03C2, the latter to be used at the end
1401 * of a word). In the following implementation the upper case
1402 * sigma will be lowercased as U+03C3.
1405 ntfschar
*ntfs_locase_table_build(const ntfschar
*uc
, u32 uc_cnt
)
1411 lc
= (ntfschar
*)ntfs_malloc(uc_cnt
*sizeof(ntfschar
));
1413 for (i
=0; i
<uc_cnt
; i
++)
1414 lc
[i
] = cpu_to_le16(i
);
1415 for (i
=0; i
<uc_cnt
; i
++) {
1416 upp
= le16_to_cpu(uc
[i
]);
1417 if ((upp
!= i
) && (upp
< uc_cnt
))
1418 lc
[upp
] = cpu_to_le16(i
);
1421 ntfs_log_error("Could not build the locase table\n");
1426 * ntfs_str2ucs - convert a string to a valid NTFS file name
1428 * @len: length of output buffer in Unicode characters
1430 * Convert the input @s string into the corresponding little endian,
1431 * 2-byte Unicode string. The length of the converted string is less
1432 * or equal to the maximum length allowed by the NTFS format (255).
1434 * If @s is NULL then return AT_UNNAMED.
1436 * On success the function returns the Unicode string in an allocated
1437 * buffer and the caller is responsible to free it when it's not needed
1440 * On error NULL is returned and errno is set to the error code.
1442 ntfschar
*ntfs_str2ucs(const char *s
, int *len
)
1444 ntfschar
*ucs
= NULL
;
1446 if (s
&& ((*len
= ntfs_mbstoucs(s
, &ucs
)) == -1)) {
1447 ntfs_log_perror("Couldn't convert '%s' to Unicode", s
);
1450 if (*len
> NTFS_MAX_NAME_LEN
) {
1452 errno
= ENAMETOOLONG
;
1455 if (!ucs
|| !*len
) {
1463 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1464 * @ucs input string to be freed
1466 * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1468 * Return value: none.
1470 void ntfs_ucsfree(ntfschar
*ucs
)
1472 if (ucs
&& (ucs
!= AT_UNNAMED
))
1477 * Check whether a name contains no chars forbidden
1478 * for DOS or Win32 use
1480 * If @strict is TRUE, then trailing dots and spaces are forbidden.
1481 * These names are technically allowed in the Win32 namespace, but
1482 * they can be problematic. See comment for FILE_NAME_WIN32.
1484 * If there is a bad char, errno is set to EINVAL
1487 BOOL
ntfs_forbidden_chars(const ntfschar
*name
, int len
, BOOL strict
)
1492 static const u32 mainset
= (1L << ('\"' - 0x20))
1493 | (1L << ('*' - 0x20))
1494 | (1L << ('/' - 0x20))
1495 | (1L << (':' - 0x20))
1496 | (1L << ('<' - 0x20))
1497 | (1L << ('>' - 0x20))
1498 | (1L << ('?' - 0x20));
1500 forbidden
= (len
== 0) ||
1501 (strict
&& (name
[len
-1] == const_cpu_to_le16(' ') ||
1502 name
[len
-1] == const_cpu_to_le16('.')));
1503 for (i
=0; i
<len
; i
++) {
1504 ch
= le16_to_cpu(name
[i
]);
1507 && ((1L << (ch
- 0x20)) & mainset
))
1518 * Check whether a name contains no forbidden chars and
1519 * is not a reserved name for DOS or Win32 use
1521 * The reserved names are CON, PRN, AUX, NUL, COM1..COM9, LPT1..LPT9
1522 * with no suffix or any suffix.
1524 * If @strict is TRUE, then trailing dots and spaces are forbidden.
1525 * These names are technically allowed in the Win32 namespace, but
1526 * they can be problematic. See comment for FILE_NAME_WIN32.
1528 * If the name is forbidden, errno is set to EINVAL
1531 BOOL
ntfs_forbidden_names(ntfs_volume
*vol
, const ntfschar
*name
, int len
,
1536 static const ntfschar dot
= const_cpu_to_le16('.');
1537 static const ntfschar con
[] = { const_cpu_to_le16('c'),
1538 const_cpu_to_le16('o'), const_cpu_to_le16('n') };
1539 static const ntfschar prn
[] = { const_cpu_to_le16('p'),
1540 const_cpu_to_le16('r'), const_cpu_to_le16('n') };
1541 static const ntfschar aux
[] = { const_cpu_to_le16('a'),
1542 const_cpu_to_le16('u'), const_cpu_to_le16('x') };
1543 static const ntfschar nul
[] = { const_cpu_to_le16('n'),
1544 const_cpu_to_le16('u'), const_cpu_to_le16('l') };
1545 static const ntfschar com
[] = { const_cpu_to_le16('c'),
1546 const_cpu_to_le16('o'), const_cpu_to_le16('m') };
1547 static const ntfschar lpt
[] = { const_cpu_to_le16('l'),
1548 const_cpu_to_le16('p'), const_cpu_to_le16('t') };
1550 forbidden
= ntfs_forbidden_chars(name
, len
, strict
);
1551 if (!forbidden
&& (len
>= 3)) {
1553 * Rough hash check to tell whether the first couple of chars
1554 * may be one of CO PR AU NU LP or lowercase variants.
1556 h
= ((le16_to_cpu(name
[0]) & 31)*48)
1557 ^ ((le16_to_cpu(name
[1]) & 31)*165);
1558 if ((h
% 23) == 17) {
1559 /* do a full check, depending on the third char */
1560 switch (le16_to_cpu(name
[2]) & ~0x20) {
1562 if (((len
== 3) || (name
[3] == dot
))
1563 && (!ntfs_ucsncasecmp(name
, con
, 3,
1564 vol
->upcase
, vol
->upcase_len
)
1565 || !ntfs_ucsncasecmp(name
, prn
, 3,
1566 vol
->upcase
, vol
->upcase_len
)))
1570 if (((len
== 3) || (name
[3] == dot
))
1571 && !ntfs_ucsncasecmp(name
, aux
, 3,
1572 vol
->upcase
, vol
->upcase_len
))
1576 if (((len
== 3) || (name
[3] == dot
))
1577 && !ntfs_ucsncasecmp(name
, nul
, 3,
1578 vol
->upcase
, vol
->upcase_len
))
1583 && (le16_to_cpu(name
[3]) >= '1')
1584 && (le16_to_cpu(name
[3]) <= '9')
1585 && ((len
== 4) || (name
[4] == dot
))
1586 && !ntfs_ucsncasecmp(name
, com
, 3,
1587 vol
->upcase
, vol
->upcase_len
))
1592 && (le16_to_cpu(name
[3]) >= '1')
1593 && (le16_to_cpu(name
[3]) <= '9')
1594 && ((len
== 4) || (name
[4] == dot
))
1595 && !ntfs_ucsncasecmp(name
, lpt
, 3,
1596 vol
->upcase
, vol
->upcase_len
))
1609 * Check whether the same name can be used as a DOS and
1612 * The names must be the same, or the short name the uppercase
1613 * variant of the long name
1616 BOOL
ntfs_collapsible_chars(ntfs_volume
*vol
,
1617 const ntfschar
*shortname
, int shortlen
,
1618 const ntfschar
*longname
, int longlen
)
1625 collapsible
= shortlen
== longlen
;
1626 for (i
=0; collapsible
&& (i
<shortlen
); i
++) {
1627 ch
= le16_to_cpu(longname
[i
]);
1628 cs
= le16_to_cpu(shortname
[i
]);
1630 && ((ch
>= vol
->upcase_len
)
1631 || (cs
>= vol
->upcase_len
)
1632 || (vol
->upcase
[cs
] != vol
->upcase
[ch
])))
1633 collapsible
= FALSE
;
1635 return (collapsible
);
1639 * Define the character encoding to be used.
1640 * Use UTF-8 unless specified otherwise.
1643 int ntfs_set_char_encoding(const char *locale
)
1646 if (!locale
|| strstr(locale
,"utf8") || strstr(locale
,"UTF8")
1647 || strstr(locale
,"utf-8") || strstr(locale
,"UTF-8"))
1650 if (setlocale(LC_ALL
, locale
))
1653 ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1656 return 0; /* always successful */
1659 #if defined(__APPLE__) || defined(__DARWIN__)
/*
 *		Enable or disable the normalization form conversion
 *	of file names translated to UTF-8
 *
 *	@normalize must be 0 (no conversion) or 1 (conversion)
 *
 *	Returns 0 if the setting was recorded
 *		-1 if @normalize is invalid, or if the conversion
 *		support (ENABLE_NFCONV) was not compiled in
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	if (normalize != 0 && normalize != 1)
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
/*
 *		Convert a UTF-8 string to a normalization form
 *
 *	@utf8_string is the '\0'-terminated string to normalize,
 *	@composed selects the composed form (NFC) when non-zero and
 *	the decomposed form (NFD) when zero.
 *
 *	On success a newly allocated '\0'-terminated string is stored
 *	into *target, to be freed by the caller. *target is left
 *	unchanged on failure.
 *
 *	Returns the length of the normalized string in bytes (not
 *			counting the terminator)
 *		-1 if the normalized string could not be produced, or if
 *			the conversion support (ENABLE_NFCONV) was not
 *			compiled in
 *		-2 if the input could not be made into a CFString
 *		-3 if the CFString could not be duplicated
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
		int composed)
{
#ifdef ENABLE_NFCONV
	/* Building this part requires linking against the CoreFoundation
	 * framework. */
	CFStringRef source;
	CFMutableStringRef work;
	CFRange whole;
	CFIndex needed;
	char *buffer = NULL;
	int size = -1;

	/* Wrap the UTF-8 input into a CFString. */
	source = CFStringCreateWithCString(kCFAllocatorDefault,
		utf8_string, kCFStringEncodingUTF8);
	if (source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Normalization is done in place, so take a modifiable copy. The
	 * immutable original is not needed past this point. */
	work = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, source);
	CFRelease(source);
	if (work == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	if (composed != 0)
		CFStringNormalize(work, kCFStringNormalizationFormC);
	else
		CFStringNormalize(work, kCFStringNormalizationFormD);

	/* First pass with no buffer : only get the required byte count. */
	whole = CFRangeMake(0, CFStringGetLength(work));
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false,
			NULL, 0, &needed) <= 0) {
		ntfs_log_error("Could not perform check for required length of "
			"UTF-8 conversion of normalized CFMutableString.\n");
		goto release;
	}

	/* One more byte for the terminator, which the zeroing provides. */
	size = sizeof(char) * (needed + 1);
	buffer = ntfs_calloc(size);
	if (buffer == NULL) {
		ntfs_log_error("Could not perform a ntfs_calloc of %d "
			"bytes for char *result.\n", size);
		goto release;
	}

	/* Second pass : actually store the UTF-8 bytes. */
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false,
			(UInt8*) buffer, size - 1, &needed) <= 0) {
		ntfs_log_error("Could not perform UTF-8 "
			"conversion of normalized "
			"CFMutableString.\n");
		free(buffer);
		buffer = NULL;
	}

release:
	CFRelease(work);

	if (buffer == NULL)
		return -1;
	*target = buffer;
	return size - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1756 #endif /* defined(__APPLE__) || defined(__DARWIN__) */