release/src/router/ntfs-3g/libntfs-3g/unistr.c

   1 /**
   2  * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
   3  *
   4  * Copyright (c) 2000-2004 Anton Altaparmakov
   5  * Copyright (c) 2002-2009 Szabolcs Szakacsits
   6  * Copyright (c) 2008-2011 Jean-Pierre Andre
   7  * Copyright (c) 2008      Bernhard Kaindl
   8  *
   9  * This program/include file is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU General Public License as published
  11  * by the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program/include file is distributed in the hope that it will be
  15  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  16  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program (in the main directory of the NTFS-3G
  21  * distribution in the file COPYING); if not, write to the Free Software
  22  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  23  */
  24
  25 #ifdef HAVE_CONFIG_H
  26 #include "config.h"
  27 #endif
  28
  29 #ifdef HAVE_STDIO_H
  30 #include <stdio.h>
  31 #endif
  32 #ifdef HAVE_STDLIB_H
  33 #include <stdlib.h>
  34 #endif
  35 #ifdef HAVE_WCHAR_H
  36 #include <wchar.h>
  37 #endif
  38 #ifdef HAVE_STRING_H
  39 #include <string.h>
  40 #endif
  41 #ifdef HAVE_ERRNO_H
  42 #include <errno.h>
  43 #endif
  44 #ifdef HAVE_LOCALE_H
  45 #include <locale.h>
  46 #endif
  47
  48 #if defined(__APPLE__) || defined(__DARWIN__)
  49 #ifdef ENABLE_NFCONV
  50 #include <CoreFoundation/CoreFoundation.h>
  51 #endif /* ENABLE_NFCONV */
  52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
  53
  54 #include "compat.h"
  55 #include "attrib.h"
  56 #include "types.h"
  57 #include "unistr.h"
  58 #include "debug.h"
  59 #include "logging.h"
  60 #include "misc.h"
  61
  62 #define NOREVBOM 0  /* JPA rejecting U+FFFE and U+FFFF, open to debate */
  63
  64 /*
  65  * IMPORTANT
  66  * =========
  67  *
  68  * All these routines assume that the Unicode characters are in little endian
  69  * encoding inside the strings!!!
  70  */
  71
/*
 * File-scope flag selecting UTF-8 as the multibyte encoding for file names.
 * It is initialised to 1 (enabled).
 */
static int use_utf8 = 1; /* use UTF-8 encoding for file names */
  73
#if defined(__APPLE__) || defined(__DARWIN__)
#ifdef ENABLE_NFCONV
/**
 * This variable controls whether or not automatic normalization form conversion
 * should be performed when translating NTFS unicode file names to UTF-8.
 * Defaults to on, but can be controlled from the outside using the function
 *   int ntfs_macosx_normalize_filenames(int normalize);
 *
 * It is read by ntfs_utf16_to_utf8() (which normalizes its output to the
 * decomposed form) and by ntfs_utf8_to_utf16() (which normalizes its input to
 * the composed form). It only exists on Apple/Darwin builds with
 * ENABLE_NFCONV defined.
 */
static int nfconvert_utf8 = 1;
#endif /* ENABLE_NFCONV */
#endif /* defined(__APPLE__) || defined(__DARWIN__) */
  85
/*
 * This is used by the name collation functions to quickly determine what
 * characters are (in)valid.
 *
 * The table is wrapped in "#if 0", so it is currently compiled out and no
 * compiled code can reference it.
 * NOTE(review): the meaning of the individual flag bits stored in each entry
 * is not defined in this file - confirm against the original Linux-NTFS
 * sources before re-enabling the table.
 */
#if 0
static const u8 legal_ansi_char_array[0x40] = {
	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,

	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
};
#endif
 105
 106 /**
 107  * ntfs_names_are_equal - compare two Unicode names for equality
 108  * @s1:                 name to compare to @s2
 109  * @s1_len:             length in Unicode characters of @s1
 110  * @s2:                 name to compare to @s1
 111  * @s2_len:             length in Unicode characters of @s2
 112  * @ic:                 ignore case bool
 113  * @upcase:             upcase table (only if @ic == IGNORE_CASE)
 114  * @upcase_size:        length in Unicode characters of @upcase (if present)
 115  *
 116  * Compare the names @s1 and @s2 and return TRUE (1) if the names are
 117  * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
 118  * the @upcase table is used to perform a case insensitive comparison.
 119  */
 120 BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
 121                 const ntfschar *s2, size_t s2_len,
 122                 const IGNORE_CASE_BOOL ic,
 123                 const ntfschar *upcase, const u32 upcase_size)
 124 {
 125         if (s1_len != s2_len)
 126                 return FALSE;
 127         if (!s1_len)
 128                 return TRUE;
 129         if (ic == CASE_SENSITIVE)
 130                 return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
 131         return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
 132                                                                        TRUE;
 133 }
 134
/*
 * ntfs_names_full_collate() fully collate two Unicode names
 *
 * @name1:	first Unicode name to compare
 * @name1_len:	length of first Unicode name to compare
 * @name2:	second Unicode name to compare
 * @name2_len:	length of second Unicode name to compare
 * @ic:		either CASE_SENSITIVE or IGNORE_CASE
 * @upcase:	upcase table; it is consulted in BOTH modes for every
 *		character whose value is below @upcase_len
 * @upcase_len:	upcase table size in Unicode characters
 *
 * Only the first min(@name1_len, @name2_len) characters are examined.
 *
 * When @ic is CASE_SENSITIVE the order is decided by, in this priority:
 *   1. the first position where the upcased characters differ,
 *   2. the name lengths (the shorter name collates first),
 *   3. the raw (non-upcased) characters at the first position where the two
 *      names are not identical.
 * Otherwise only criteria 1 and 2 apply.
 *
 * Returns:
 *  -1 if the first name collates before the second one,
 *   0 if the names match,
 *   1 if the second name collates before the first one.
 */
int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len,
		const ntfschar *name2, const u32 name2_len,
		const IGNORE_CASE_BOOL ic, const ntfschar *upcase,
		const u32 upcase_len)
{
	u32 cnt;	/* characters of the common length still to examine */
	u16 c1, c2;	/* raw characters at the first exact mismatch */
	u16 u1, u2;	/* upcased characters currently being compared */

#ifdef DEBUG
	/* The upcase table is only insisted upon when @ic is non-zero. */
	if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) {
		ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
		exit(1);
	}
#endif
	cnt = min(name1_len, name2_len);
	if (cnt > 0) {
		if (ic == CASE_SENSITIVE) {
			/*
			 * Skip the identical prefix. The loop stops on the
			 * first differing character or on the last character
			 * of the common length; afterwards cnt holds the
			 * number of characters that follow the current one.
			 */
			while (--cnt && (*name1 == *name2)) {
				name1++;
				name2++;
			}
			/* Remember the raw characters for the final tie-break. */
			u1 = c1 = le16_to_cpu(*name1);
			u2 = c2 = le16_to_cpu(*name2);
			if (u1 < upcase_len)
				u1 = le16_to_cpu(upcase[u1]);
			if (u2 < upcase_len)
				u2 = le16_to_cpu(upcase[u2]);
			/*
			 * The characters only differ in case (or not at all):
			 * keep comparing the upcased values of what is left.
			 */
			if ((u1 == u2) && cnt)
				do {
					name1++;
					u1 = le16_to_cpu(*name1);
					name2++;
					u2 = le16_to_cpu(*name2);
					if (u1 < upcase_len)
						u1 = le16_to_cpu(upcase[u1]);
					if (u2 < upcase_len)
						u2 = le16_to_cpu(upcase[u2]);
				} while ((u1 == u2) && --cnt);
			/* 1. upcased difference */
			if (u1 < u2)
				return -1;
			if (u1 > u2)
				return 1;
			/* 2. length difference */
			if (name1_len < name2_len)
				return -1;
			if (name1_len > name2_len)
				return 1;
			/* 3. raw difference at the first exact mismatch */
			if (c1 < c2)
				return -1;
			if (c1 > c2)
				return 1;
		} else {
			/* Compare upcased characters over the common length. */
			do {
				u1 = c1 = le16_to_cpu(*name1);
				name1++;
				u2 = c2 = le16_to_cpu(*name2);
				name2++;
				if (u1 < upcase_len)
					u1 = le16_to_cpu(upcase[u1]);
				if (u2 < upcase_len)
					u2 = le16_to_cpu(upcase[u2]);
			} while ((u1 == u2) && --cnt);
			if (u1 < u2)
				return -1;
			if (u1 > u2)
				return 1;
			/* Equal over the common length: shorter name first. */
			if (name1_len < name2_len)
				return -1;
			if (name1_len > name2_len)
				return 1;
		}
	} else {
		/* At least one name is empty: only the lengths matter. */
		if (name1_len < name2_len)
			return -1;
		if (name1_len > name2_len)
			return 1;
	}
	return 0;
}
 230
 231 /**
 232  * ntfs_ucsncmp - compare two little endian Unicode strings
 233  * @s1:         first string
 234  * @s2:         second string
 235  * @n:          maximum unicode characters to compare
 236  *
 237  * Compare the first @n characters of the Unicode strings @s1 and @s2,
 238  * The strings in little endian format and appropriate le16_to_cpu()
 239  * conversion is performed on non-little endian machines.
 240  *
 241  * The function returns an integer less than, equal to, or greater than zero
 242  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
 243  * to be less than, to match, or be greater than @s2.
 244  */
 245 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
 246 {
 247         ntfschar c1, c2;
 248         size_t i;
 249
 250 #ifdef DEBUG
 251         if (!s1 || !s2) {
 252                 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
 253                 exit(1);
 254         }
 255 #endif
 256         for (i = 0; i < n; ++i) {
 257                 c1 = le16_to_cpu(s1[i]);
 258                 c2 = le16_to_cpu(s2[i]);
 259                 if (c1 < c2)
 260                         return -1;
 261                 if (c1 > c2)
 262                         return 1;
 263                 if (!c1)
 264                         break;
 265         }
 266         return 0;
 267 }
 268
 269 /**
 270  * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
 271  * @s1:                 first string
 272  * @s2:                 second string
 273  * @n:                  maximum unicode characters to compare
 274  * @upcase:             upcase table
 275  * @upcase_size:        upcase table size in Unicode characters
 276  *
 277  * Compare the first @n characters of the Unicode strings @s1 and @s2,
 278  * ignoring case. The strings in little endian format and appropriate
 279  * le16_to_cpu() conversion is performed on non-little endian machines.
 280  *
 281  * Each character is uppercased using the @upcase table before the comparison.
 282  *
 283  * The function returns an integer less than, equal to, or greater than zero
 284  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
 285  * to be less than, to match, or be greater than @s2.
 286  */
 287 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
 288                 const ntfschar *upcase, const u32 upcase_size)
 289 {
 290         u16 c1, c2;
 291         size_t i;
 292
 293 #ifdef DEBUG
 294         if (!s1 || !s2 || !upcase) {
 295                 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
 296                 exit(1);
 297         }
 298 #endif
 299         for (i = 0; i < n; ++i) {
 300                 if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
 301                         c1 = le16_to_cpu(upcase[c1]);
 302                 if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
 303                         c2 = le16_to_cpu(upcase[c2]);
 304                 if (c1 < c2)
 305                         return -1;
 306                 if (c1 > c2)
 307                         return 1;
 308                 if (!c1)
 309                         break;
 310         }
 311         return 0;
 312 }
 313
 314 /**
 315  * ntfs_ucsnlen - determine the length of a little endian Unicode string
 316  * @s:          pointer to Unicode string
 317  * @maxlen:     maximum length of string @s
 318  *
 319  * Return the number of Unicode characters in the little endian Unicode
 320  * string @s up to a maximum of maxlen Unicode characters, not including
 321  * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
 322  * and @s + @maxlen, @maxlen is returned.
 323  *
 324  * This function never looks beyond @s + @maxlen.
 325  */
 326 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen)
 327 {
 328         u32 i;
 329
 330         for (i = 0; i < maxlen; i++) {
 331                 if (!le16_to_cpu(s[i]))
 332                         break;
 333         }
 334         return i;
 335 }
 336
 337 /**
 338  * ntfs_ucsndup - duplicate little endian Unicode string
 339  * @s:          pointer to Unicode string
 340  * @maxlen:     maximum length of string @s
 341  *
 342  * Return a pointer to a new little endian Unicode string which is a duplicate
 343  * of the string s.  Memory for the new string is obtained with ntfs_malloc(3),
 344  * and can be freed with free(3).
 345  *
 346  * A maximum of @maxlen Unicode characters are copied and a terminating
 347  * (ntfschar)'\0' little endian Unicode character is added.
 348  *
 349  * This function never looks beyond @s + @maxlen.
 350  *
 351  * Return a pointer to the new little endian Unicode string on success and NULL
 352  * on failure with errno set to the error code.
 353  */
 354 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen)
 355 {
 356         ntfschar *dst;
 357         u32 len;
 358
 359         len = ntfs_ucsnlen(s, maxlen);
 360         dst = ntfs_malloc((len + 1) * sizeof(ntfschar));
 361         if (dst) {
 362                 memcpy(dst, s, len * sizeof(ntfschar));
 363                 dst[len] = cpu_to_le16(L'\0');
 364         }
 365         return dst;
 366 }
 367
 368 /**
 369  * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent
 370  * @name:
 371  * @name_len:
 372  * @upcase:
 373  * @upcase_len:
 374  *
 375  * Description...
 376  *
 377  * Returns:
 378  */
 379 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase,
 380                 const u32 upcase_len)
 381 {
 382         u32 i;
 383         u16 u;
 384
 385         for (i = 0; i < name_len; i++)
 386                 if ((u = le16_to_cpu(name[i])) < upcase_len)
 387                         name[i] = upcase[u];
 388 }
 389
 390 /**
 391  * ntfs_name_locase - Map a Unicode name to its lowercase equivalent
 392  */
 393 void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase,
 394                 const u32 locase_len)
 395 {
 396         u32 i;
 397         u16 u;
 398
 399         if (locase)
 400                 for (i = 0; i < name_len; i++)
 401                         if ((u = le16_to_cpu(name[i])) < locase_len)
 402                                 name[i] = locase[u];
 403 }
 404
 405 /**
 406  * ntfs_file_value_upcase - Convert a filename to upper case
 407  * @file_name_attr:
 408  * @upcase:
 409  * @upcase_len:
 410  *
 411  * Description...
 412  *
 413  * Returns:
 414  */
 415 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
 416                 const ntfschar *upcase, const u32 upcase_len)
 417 {
 418         ntfs_name_upcase((ntfschar*)&file_name_attr->file_name,
 419                         file_name_attr->file_name_length, upcase, upcase_len);
 420 }
 421
/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7-bit ASCII/ANSI,
   glibc does this in a hard-coded fashion even without a locale, which
   is easy because the low 7-bit ASCII range appears to be available in
   all charsets. However, it does not convert anything else if there was
   an error with the locale setup, or if no locale is set up at all, as
   when mount is called during early boot, where (by policy) locales are
   not used (and may not be available if /usr is not yet mounted).
   This patch fixes the resulting issues for systems which use UTF-8;
   for others, specifying the locale in fstab brings them the encoding
   which they want.

   If no locale is defined, or there was a problem with setting one
   up, and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G does not show any path names which include
   international characters (and also fails to create them).

   Author: Bernhard Kaindl <bk@suse.de>
   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
 445
 446 /*
 447  * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
 448  * null) to store a given UTF-16LE string.
 449  *
 450  * Return -1 with errno set if string has invalid byte sequence or too long.
 451  */
 452 static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
 453 {
 454         int i, ret = -1;
 455         int count = 0;
 456         BOOL surrog;
 457
 458         surrog = FALSE;
 459         for (i = 0; i < ins_len && ins[i]; i++) {
 460                 unsigned short c = le16_to_cpu(ins[i]);
 461                 if (surrog) {
 462                         if ((c >= 0xdc00) && (c < 0xe000)) {
 463                                 surrog = FALSE;
 464                                 count += 4;
 465                         } else
 466                                 goto fail;
 467                 } else
 468                         if (c < 0x80)
 469                                 count++;
 470                         else if (c < 0x800)
 471                                 count += 2;
 472                         else if (c < 0xd800)
 473                                 count += 3;
 474                         else if (c < 0xdc00)
 475                                 surrog = TRUE;
 476 #if NOREVBOM
 477                         else if ((c >= 0xe000) && (c < 0xfffe))
 478 #else
 479                         else if (c >= 0xe000)
 480 #endif
 481                                 count += 3;
 482                         else
 483                                 goto fail;
 484                 if (count > outs_len) {
 485                         errno = ENAMETOOLONG;
 486                         goto out;
 487                 }
 488         }
 489         if (surrog)
 490                 goto fail;
 491
 492         ret = count;
 493 out:
 494         return ret;
 495 fail:
 496         errno = EILSEQ;
 497         goto out;
 498 }
 499
 500 /*
 501  * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
 502  * @ins:        input utf16 string buffer
 503  * @ins_len:    length of input string in utf16 characters
 504  * @outs:       on return contains the (allocated) output multibyte string
 505  * @outs_len:   length of output buffer in bytes
 506  *
 507  * Return -1 with errno set if string has invalid byte sequence or too long.
 508  */
 509 static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
 510                               char **outs, int outs_len)
 511 {
 512 #if defined(__APPLE__) || defined(__DARWIN__)
 513 #ifdef ENABLE_NFCONV
 514         char *original_outs_value = *outs;
 515         int original_outs_len = outs_len;
 516 #endif /* ENABLE_NFCONV */
 517 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
 518
 519         char *t;
 520         int i, size, ret = -1;
 521         int halfpair;
 522
 523         halfpair = 0;
 524         if (!*outs)
 525                 outs_len = PATH_MAX;
 526
 527         size = utf16_to_utf8_size(ins, ins_len, outs_len);
 528
 529         if (size < 0)
 530                 goto out;
 531
 532         if (!*outs) {
 533                 outs_len = size + 1;
 534                 *outs = ntfs_malloc(outs_len);
 535                 if (!*outs)
 536                         goto out;
 537         }
 538
 539         t = *outs;
 540
 541         for (i = 0; i < ins_len && ins[i]; i++) {
 542             unsigned short c = le16_to_cpu(ins[i]);
 543                         /* size not double-checked */
 544                 if (halfpair) {
 545                         if ((c >= 0xdc00) && (c < 0xe000)) {
 546                                 *t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
 547                                 *t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
 548                                 *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
 549                                 *t++ = 0x80 + (c & 63);
 550                                 halfpair = 0;
 551                         } else
 552                                 goto fail;
 553                 } else if (c < 0x80) {
 554                         *t++ = c;
 555                 } else {
 556                         if (c < 0x800) {
 557                                 *t++ = (0xc0 | ((c >> 6) & 0x3f));
 558                                 *t++ = 0x80 | (c & 0x3f);
 559                         } else if (c < 0xd800) {
 560                                 *t++ = 0xe0 | (c >> 12);
 561                                 *t++ = 0x80 | ((c >> 6) & 0x3f);
 562                                 *t++ = 0x80 | (c & 0x3f);
 563                         } else if (c < 0xdc00)
 564                                 halfpair = c;
 565                         else if (c >= 0xe000) {
 566                                 *t++ = 0xe0 | (c >> 12);
 567                                 *t++ = 0x80 | ((c >> 6) & 0x3f);
 568                                 *t++ = 0x80 | (c & 0x3f);
 569                         } else
 570                                 goto fail;
 571                 }
 572         }
 573         *t = '\0';
 574
 575 #if defined(__APPLE__) || defined(__DARWIN__)
 576 #ifdef ENABLE_NFCONV
 577         if(nfconvert_utf8 && (t - *outs) > 0) {
 578                 char *new_outs = NULL;
 579                 int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form
 580                 if(new_outs_len >= 0 && new_outs != NULL) {
 581                         if(original_outs_value != *outs) {
 582                                 // We have allocated outs ourselves.
 583                                 free(*outs);
 584                                 *outs = new_outs;
 585                                 t = *outs + new_outs_len;
 586                         }
 587                         else {
 588                                 // We need to copy new_outs into the fixed outs buffer.
 589                                 memset(*outs, 0, original_outs_len);
 590                                 strncpy(*outs, new_outs, original_outs_len-1);
 591                                 t = *outs + original_outs_len;
 592                                 free(new_outs);
 593                         }
 594                 }
 595                 else {
 596                         ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs);
 597                         ntfs_log_error("  new_outs=0x%p\n", new_outs);
 598                         ntfs_log_error("  new_outs_len=%d\n", new_outs_len);
 599                 }
 600         }
 601 #endif /* ENABLE_NFCONV */
 602 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
 603
 604         ret = t - *outs;
 605 out:
 606         return ret;
 607 fail:
 608         errno = EILSEQ;
 609         goto out;
 610 }
 611
 612 /*
 613  * Return the amount of 16-bit elements in UTF-16LE needed
 614  * (without the terminating null) to store given UTF-8 string.
 615  *
 616  * Return -1 with errno set if it's longer than PATH_MAX or string is invalid.
 617  *
 618  * Note: This does not check whether the input sequence is a valid utf8 string,
 619  *       and should be used only in context where such check is made!
 620  */
 621 static int utf8_to_utf16_size(const char *s)
 622 {
 623         int ret = -1;
 624         unsigned int byte;
 625         size_t count = 0;
 626
 627         while ((byte = *((const unsigned char *)s++))) {
 628                 if (++count >= PATH_MAX)
 629                         goto fail;
 630                 if (byte >= 0xc0) {
 631                         if (byte >= 0xF5) {
 632                                 errno = EILSEQ;
 633                                 goto out;
 634                         }
 635                         if (!*s)
 636                                 break;
 637                         if (byte >= 0xC0)
 638                                 s++;
 639                         if (!*s)
 640                                 break;
 641                         if (byte >= 0xE0)
 642                                 s++;
 643                         if (!*s)
 644                                 break;
 645                         if (byte >= 0xF0) {
 646                                 s++;
 647                                 if (++count >= PATH_MAX)
 648                                         goto fail;
 649                         }
 650                 }
 651         }
 652         ret = count;
 653 out:
 654         return ret;
 655 fail:
 656         errno = ENAMETOOLONG;
 657         goto out;
 658 }
 659 /*
 660  * This converts one UTF-8 sequence to cpu-endian Unicode value
 661  * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
 662  *
 663  * Return the number of used utf8 bytes or -1 with errno set
 664  * if sequence is invalid.
 665  */
 666 static int utf8_to_unicode(u32 *wc, const char *s)
 667 {
 668         unsigned int byte = *((const unsigned char *)s);
 669
 670                                         /* single byte */
 671         if (byte == 0) {
 672                 *wc = (u32) 0;
 673                 return 0;
 674         } else if (byte < 0x80) {
 675                 *wc = (u32) byte;
 676                 return 1;
 677                                         /* double byte */
 678         } else if (byte < 0xc2) {
 679                 goto fail;
 680         } else if (byte < 0xE0) {
 681                 if ((s[1] & 0xC0) == 0x80) {
 682                         *wc = ((u32)(byte & 0x1F) << 6)
 683                             | ((u32)(s[1] & 0x3F));
 684                         return 2;
 685                 } else
 686                         goto fail;
 687                                         /* three-byte */
 688         } else if (byte < 0xF0) {
 689                 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
 690                         *wc = ((u32)(byte & 0x0F) << 12)
 691                             | ((u32)(s[1] & 0x3F) << 6)
 692                             | ((u32)(s[2] & 0x3F));
 693                         /* Check valid ranges */
 694 #if NOREVBOM
 695                         if (((*wc >= 0x800) && (*wc <= 0xD7FF))
 696                           || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
 697                                 return 3;
 698 #else
 699                         if (((*wc >= 0x800) && (*wc <= 0xD7FF))
 700                           || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
 701                                 return 3;
 702 #endif
 703                 }
 704                 goto fail;
 705                                         /* four-byte */
 706         } else if (byte < 0xF5) {
 707                 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
 708                   && ((s[3] & 0xC0) == 0x80)) {
 709                         *wc = ((u32)(byte & 0x07) << 18)
 710                             | ((u32)(s[1] & 0x3F) << 12)
 711                             | ((u32)(s[2] & 0x3F) << 6)
 712                             | ((u32)(s[3] & 0x3F));
 713                         /* Check valid ranges */
 714                         if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
 715                                 return 4;
 716                 }
 717                 goto fail;
 718         }
 719 fail:
 720         errno = EILSEQ;
 721         return -1;
 722 }
 723
 724 /**
 725  * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
 726  * @ins:        input multibyte string buffer
 727  * @outs:       on return contains the (allocated) output utf16 string
 728  * @outs_len:   length of output buffer in utf16 characters
 729  *
 730  * Return -1 with errno set.
 731  */
 732 static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
 733 {
 734 #if defined(__APPLE__) || defined(__DARWIN__)
 735 #ifdef ENABLE_NFCONV
 736         char *new_ins = NULL;
 737         if(nfconvert_utf8) {
 738                 int new_ins_len;
 739                 new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form
 740                 if(new_ins_len >= 0)
 741                         ins = new_ins;
 742                 else
 743                         ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins);
 744         }
 745 #endif /* ENABLE_NFCONV */
 746 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
 747         const char *t = ins;
 748         u32 wc;
 749         BOOL allocated;
 750         ntfschar *outpos;
 751         int shorts, ret = -1;
 752
 753         shorts = utf8_to_utf16_size(ins);
 754         if (shorts < 0)
 755                 goto fail;
 756
 757         allocated = FALSE;
 758         if (!*outs) {
 759                 *outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
 760                 if (!*outs)
 761                         goto fail;
 762                 allocated = TRUE;
 763         }
 764
 765         outpos = *outs;
 766
 767         while(1) {
 768                 int m  = utf8_to_unicode(&wc, t);
 769                 if (m <= 0) {
 770                         if (m < 0) {
 771                                 /* do not leave space allocated if failed */
 772                                 if (allocated) {
 773                                         free(*outs);
 774                                         *outs = (ntfschar*)NULL;
 775                                 }
 776                                 goto fail;
 777                         }
 778                         *outpos++ = const_cpu_to_le16(0);
 779                         break;
 780                 }
 781                 if (wc < 0x10000)
 782                         *outpos++ = cpu_to_le16(wc);
 783                 else {
 784                         wc -= 0x10000;
 785                         *outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
 786                         *outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
 787                 }
 788                 t += m;
 789         }
 790
 791         ret = --outpos - *outs;
 792 fail:
 793 #if defined(__APPLE__) || defined(__DARWIN__)
 794 #ifdef ENABLE_NFCONV
 795         if(new_ins != NULL)
 796                 free(new_ins);
 797 #endif /* ENABLE_NFCONV */
 798 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
 799         return ret;
 800 }
 801
 802 /**
 803  * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
 804  * @ins:        input Unicode string buffer
 805  * @ins_len:    length of input string in Unicode characters
 806  * @outs:       on return contains the (allocated) output multibyte string
 807  * @outs_len:   length of output buffer in bytes
 808  *
 809  * Convert the input little endian, 2-byte Unicode string @ins, of length
 810  * @ins_len into the multibyte string format dictated by the current locale.
 811  *
 812  * If *@outs is NULL, the function allocates the string and the caller is
 813  * responsible for calling free(*@outs); when finished with it.
 814  *
 815  * On success the function returns the number of bytes written to the output
 816  * string *@outs (>= 0), not counting the terminating NULL byte. If the output
 817  * string buffer was allocated, *@outs is set to it.
 818  *
 819  * On error, -1 is returned, and errno is set to the error code. The following
 820  * error codes can be expected:
 821  *      EINVAL          Invalid arguments (e.g. @ins or @outs is NULL).
 822  *      EILSEQ          The input string cannot be represented as a multibyte
 823  *                      sequence according to the current locale.
 824  *      ENAMETOOLONG    Destination buffer is too small for input string.
 825  *      ENOMEM          Not enough memory to allocate destination buffer.
 826  */
 827 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
 828                 int outs_len)
 829 {
 830         char *mbs;
 831         int mbs_len;
 832 #ifdef MB_CUR_MAX
 833         wchar_t wc;
 834         int i, o;
 835         int cnt = 0;
 836 #ifdef HAVE_MBSINIT
 837         mbstate_t mbstate;
 838 #endif
 839 #endif /* MB_CUR_MAX */
 840
 841         if (!ins || !outs) {
 842                 errno = EINVAL;
 843                 return -1;
 844         }
 845         mbs = *outs;
 846         mbs_len = outs_len;
 847         if (mbs && !mbs_len) {
 848                 errno = ENAMETOOLONG;
 849                 return -1;
 850         }
 851         if (use_utf8)
 852                 return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
 853 #ifdef MB_CUR_MAX
 854         if (!mbs) {
 855                 mbs_len = (ins_len + 1) * MB_CUR_MAX;
 856                 mbs = ntfs_malloc(mbs_len);
 857                 if (!mbs)
 858                         return -1;
 859         }
 860 #ifdef HAVE_MBSINIT
 861         memset(&mbstate, 0, sizeof(mbstate));
 862 #else
 863         wctomb(NULL, 0);
 864 #endif
 865         for (i = o = 0; i < ins_len; i++) {
 866                 /* Reallocate memory if necessary or abort. */
 867                 if ((int)(o + MB_CUR_MAX) > mbs_len) {
 868                         char *tc;
 869                         if (mbs == *outs) {
 870                                 errno = ENAMETOOLONG;
 871                                 return -1;
 872                         }
 873                         tc = ntfs_malloc((mbs_len + 64) & ~63);
 874                         if (!tc)
 875                                 goto err_out;
 876                         memcpy(tc, mbs, mbs_len);
 877                         mbs_len = (mbs_len + 64) & ~63;
 878                         free(mbs);
 879                         mbs = tc;
 880                 }
 881                 /* Convert the LE Unicode character to a CPU wide character. */
 882                 wc = (wchar_t)le16_to_cpu(ins[i]);
 883                 if (!wc)
 884                         break;
 885                 /* Convert the CPU endian wide character to multibyte. */
 886 #ifdef HAVE_MBSINIT
 887                 cnt = wcrtomb(mbs + o, wc, &mbstate);
 888 #else
 889                 cnt = wctomb(mbs + o, wc);
 890 #endif
 891                 if (cnt == -1)
 892                         goto err_out;
 893                 if (cnt <= 0) {
 894                         ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
 895                         errno = EINVAL;
 896                         goto err_out;
 897                 }
 898                 o += cnt;
 899         }
 900 #ifdef HAVE_MBSINIT
 901         /* Make sure we are back in the initial state. */
 902         if (!mbsinit(&mbstate)) {
 903                 ntfs_log_debug("Eeek. mbstate not in initial state!\n");
 904                 errno = EILSEQ;
 905                 goto err_out;
 906         }
 907 #endif
 908         /* Now write the NULL character. */
 909         mbs[o] = '\0';
 910         if (*outs != mbs)
 911                 *outs = mbs;
 912         return o;
 913 err_out:
 914         if (mbs != *outs) {
 915                 int eo = errno;
 916                 free(mbs);
 917                 errno = eo;
 918         }
 919 #else /* MB_CUR_MAX */
 920         errno = EILSEQ;
 921 #endif /* MB_CUR_MAX */
 922         return -1;
 923 }
 924
 925 /**
 926  * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
 927  * @ins:        input multibyte string buffer
 928  * @outs:       on return contains the (allocated) output Unicode string
 929  *
 930  * Convert the input multibyte string @ins, from the current locale into the
 931  * corresponding little endian, 2-byte Unicode string.
 932  *
 933  * The function allocates the string and the caller is responsible for calling
 934  * free(*@outs); when finished with it.
 935  *
 936  * On success the function returns the number of Unicode characters written to
 937  * the output string *@outs (>= 0), not counting the terminating Unicode NULL
 938  * character.
 939  *
 940  * On error, -1 is returned, and errno is set to the error code. The following
 941  * error codes can be expected:
 942  *      EINVAL          Invalid arguments (e.g. @ins or @outs is NULL).
 943  *      EILSEQ          The input string cannot be represented as a Unicode
 944  *                      string according to the current locale.
 945  *      ENAMETOOLONG    Destination buffer is too small for input string.
 946  *      ENOMEM          Not enough memory to allocate destination buffer.
 947  */
 948 int ntfs_mbstoucs(const char *ins, ntfschar **outs)
 949 {
 950 #ifdef MB_CUR_MAX
 951         ntfschar *ucs;
 952         const char *s;
 953         wchar_t wc;
 954         int i, o, cnt, ins_len, ucs_len, ins_size;
 955 #ifdef HAVE_MBSINIT
 956         mbstate_t mbstate;
 957 #endif
 958 #endif /* MB_CUR_MAX */
 959
 960         if (!ins || !outs) {
 961                 errno = EINVAL;
 962                 return -1;
 963         }
 964
 965         if (use_utf8)
 966                 return ntfs_utf8_to_utf16(ins, outs);
 967
 968 #ifdef MB_CUR_MAX
 969         /* Determine the size of the multi-byte string in bytes. */
 970         ins_size = strlen(ins);
 971         /* Determine the length of the multi-byte string. */
 972         s = ins;
 973 #if defined(HAVE_MBSINIT)
 974         memset(&mbstate, 0, sizeof(mbstate));
 975         ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
 976 #ifdef __CYGWIN32__
 977         if (!ins_len && *ins) {
 978                 /* Older Cygwin had broken mbsrtowcs() implementation. */
 979                 ins_len = strlen(ins);
 980         }
 981 #endif
 982 #elif !defined(DJGPP)
 983         ins_len = mbstowcs(NULL, s, 0);
 984 #else
 985         /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
 986         ins_len = strlen(ins);
 987 #endif
 988         if (ins_len == -1)
 989                 return ins_len;
 990 #ifdef HAVE_MBSINIT
 991         if ((s != ins) || !mbsinit(&mbstate)) {
 992 #else
 993         if (s != ins) {
 994 #endif
 995                 errno = EILSEQ;
 996                 return -1;
 997         }
 998         /* Add the NULL terminator. */
 999         ins_len++;
1000         ucs_len = ins_len;
1001         ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
1002         if (!ucs)
1003                 return -1;
1004 #ifdef HAVE_MBSINIT
1005         memset(&mbstate, 0, sizeof(mbstate));
1006 #else
1007         mbtowc(NULL, NULL, 0);
1008 #endif
1009         for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
1010                 /* Reallocate memory if necessary. */
1011                 if (o >= ucs_len) {
1012                         ntfschar *tc;
1013                         ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
1014                         tc = realloc(ucs, ucs_len);
1015                         if (!tc)
1016                                 goto err_out;
1017                         ucs = tc;
1018                         ucs_len /= sizeof(ntfschar);
1019                 }
1020                 /* Convert the multibyte character to a wide character. */
1021 #ifdef HAVE_MBSINIT
1022                 cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
1023 #else
1024                 cnt = mbtowc(&wc, ins + i, ins_size - i);
1025 #endif
1026                 if (!cnt)
1027                         break;
1028                 if (cnt == -1)
1029                         goto err_out;
1030                 if (cnt < -1) {
1031                         ntfs_log_trace("Eeek. cnt = %i\n", cnt);
1032                         errno = EINVAL;
1033                         goto err_out;
1034                 }
1035                 /* Make sure we are not overflowing the NTFS Unicode set. */
1036                 if ((unsigned long)wc >= (unsigned long)(1 <<
1037                                 (8 * sizeof(ntfschar)))) {
1038                         errno = EILSEQ;
1039                         goto err_out;
1040                 }
1041                 /* Convert the CPU wide character to a LE Unicode character. */
1042                 ucs[o] = cpu_to_le16(wc);
1043         }
1044 #ifdef HAVE_MBSINIT
1045         /* Make sure we are back in the initial state. */
1046         if (!mbsinit(&mbstate)) {
1047                 ntfs_log_trace("Eeek. mbstate not in initial state!\n");
1048                 errno = EILSEQ;
1049                 goto err_out;
1050         }
1051 #endif
1052         /* Now write the NULL character. */
1053         ucs[o] = cpu_to_le16(L'\0');
1054         *outs = ucs;
1055         return o;
1056 err_out:
1057         free(ucs);
1058 #else /* MB_CUR_MAX */
1059         errno = EILSEQ;
1060 #endif /* MB_CUR_MAX */
1061         return -1;
1062 }
1063
1064 /*
1065  *              Turn a UTF8 name uppercase
1066  *
1067  *      Returns an allocated uppercase name which has to be freed by caller
1068  *      or NULL if there is an error (described by errno)
1069  */
1070
1071 char *ntfs_uppercase_mbs(const char *low,
1072                         const ntfschar *upcase, u32 upcase_size)
1073 {
1074         int size;
1075         char *upp;
1076         u32 wc;
1077         int n;
1078         const char *s;
1079         char *t;
1080
1081         size = strlen(low);
1082         upp = (char*)ntfs_malloc(3*size + 1);
1083         if (upp) {
1084                 s = low;
1085                 t = upp;
1086                 do {
1087                         n = utf8_to_unicode(&wc, s);
1088                         if (n > 0) {
1089                                 if (wc < upcase_size)
1090                                         wc = le16_to_cpu(upcase[wc]);
1091                                 if (wc < 0x80)
1092                                         *t++ = wc;
1093                                 else if (wc < 0x800) {
1094                                         *t++ = (0xc0 | ((wc >> 6) & 0x3f));
1095                                         *t++ = 0x80 | (wc & 0x3f);
1096                                 } else if (wc < 0x10000) {
1097                                         *t++ = 0xe0 | (wc >> 12);
1098                                         *t++ = 0x80 | ((wc >> 6) & 0x3f);
1099                                         *t++ = 0x80 | (wc & 0x3f);
1100                                 } else {
1101                                         *t++ = 0xf0 | ((wc >> 18) & 7);
1102                                         *t++ = 0x80 | ((wc >> 12) & 63);
1103                                         *t++ = 0x80 | ((wc >> 6) & 0x3f);
1104                                         *t++ = 0x80 | (wc & 0x3f);
1105                                 }
1106                         s += n;
1107                         }
1108                 } while (n > 0);
1109                 if (n < 0) {
1110                         free(upp);
1111                         upp = (char*)NULL;
1112                         errno = EILSEQ;
1113                 }
1114                 *t = 0;
1115         }
1116         return (upp);
1117 }
1118
1119 /**
1120  * ntfs_upcase_table_build - build the default upcase table for NTFS
1121  * @uc:         destination buffer where to store the built table
1122  * @uc_len:     size of destination buffer in bytes
1123  *
1124  * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1125  * stores it in the caller supplied buffer @uc of size @uc_len.
1126  *
1127  * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1128  */
1129 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len)
1130 {
1131 #if 1 /* Vista */
1132         /*
1133          *      This is the table as defined by Vista
1134          */
1135         /*
1136          * "Start" is inclusive and "End" is exclusive, every value has the
1137          * value of "Add" added to it.
1138          */
1139         static int uc_run_table[][3] = { /* Start, End, Add */
1140         {0x0061, 0x007b,   -32}, {0x00e0, 0x00f7,  -32}, {0x00f8, 0x00ff, -32},
1141         {0x0256, 0x0258,  -205}, {0x028a, 0x028c, -217}, {0x037b, 0x037e, 130},
1142         {0x03ac, 0x03ad,   -38}, {0x03ad, 0x03b0,  -37}, {0x03b1, 0x03c2, -32},
1143         {0x03c2, 0x03c3,   -31}, {0x03c3, 0x03cc,  -32}, {0x03cc, 0x03cd, -64},
1144         {0x03cd, 0x03cf,   -63}, {0x0430, 0x0450,  -32}, {0x0450, 0x0460, -80},
1145         {0x0561, 0x0587,   -48}, {0x1f00, 0x1f08,    8}, {0x1f10, 0x1f16,   8},
1146         {0x1f20, 0x1f28,     8}, {0x1f30, 0x1f38,    8}, {0x1f40, 0x1f46,   8},
1147         {0x1f51, 0x1f52,     8}, {0x1f53, 0x1f54,    8}, {0x1f55, 0x1f56,   8},
1148         {0x1f57, 0x1f58,     8}, {0x1f60, 0x1f68,    8}, {0x1f70, 0x1f72,  74},
1149         {0x1f72, 0x1f76,    86}, {0x1f76, 0x1f78,  100}, {0x1f78, 0x1f7a, 128},
1150         {0x1f7a, 0x1f7c,   112}, {0x1f7c, 0x1f7e,  126}, {0x1f80, 0x1f88,   8},
1151         {0x1f90, 0x1f98,     8}, {0x1fa0, 0x1fa8,    8}, {0x1fb0, 0x1fb2,   8},
1152         {0x1fb3, 0x1fb4,     9}, {0x1fcc, 0x1fcd,   -9}, {0x1fd0, 0x1fd2,   8},
1153         {0x1fe0, 0x1fe2,     8}, {0x1fe5, 0x1fe6,    7}, {0x1ffc, 0x1ffd,  -9},
1154         {0x2170, 0x2180,   -16}, {0x24d0, 0x24ea,  -26}, {0x2c30, 0x2c5f, -48},
1155         {0x2d00, 0x2d26, -7264}, {0xff41, 0xff5b,  -32}, {0}
1156         };
1157         /*
1158          * "Start" is exclusive and "End" is inclusive, every second value is
1159          * decremented by one.
1160          */
1161         static int uc_dup_table[][2] = { /* Start, End */
1162         {0x0100, 0x012f}, {0x0132, 0x0137}, {0x0139, 0x0149}, {0x014a, 0x0178},
1163         {0x0179, 0x017e}, {0x01a0, 0x01a6}, {0x01b3, 0x01b7}, {0x01cd, 0x01dd},
1164         {0x01de, 0x01ef}, {0x01f4, 0x01f5}, {0x01f8, 0x01f9}, {0x01fa, 0x0220},
1165         {0x0222, 0x0234}, {0x023b, 0x023c}, {0x0241, 0x0242}, {0x0246, 0x024f},
1166         {0x03d8, 0x03ef}, {0x03f7, 0x03f8}, {0x03fa, 0x03fb}, {0x0460, 0x0481},
1167         {0x048a, 0x04bf}, {0x04c1, 0x04c4}, {0x04c5, 0x04c8}, {0x04c9, 0x04ce},
1168         {0x04ec, 0x04ed}, {0x04d0, 0x04eb}, {0x04ee, 0x04f5}, {0x04f6, 0x0513},
1169         {0x1e00, 0x1e95}, {0x1ea0, 0x1ef9}, {0x2183, 0x2184}, {0x2c60, 0x2c61},
1170         {0x2c67, 0x2c6c}, {0x2c75, 0x2c76}, {0x2c80, 0x2ce3}, {0}
1171         };
1172         /*
1173          * Set the Unicode character at offset "Offset" to "Value".  Note,
1174          * "Value" is host endian.
1175          */
1176         static int uc_byte_table[][2] = { /* Offset, Value */
1177         {0x00ff, 0x0178}, {0x0180, 0x0243}, {0x0183, 0x0182}, {0x0185, 0x0184},
1178         {0x0188, 0x0187}, {0x018c, 0x018b}, {0x0192, 0x0191}, {0x0195, 0x01f6},
1179         {0x0199, 0x0198}, {0x019a, 0x023d}, {0x019e, 0x0220}, {0x01a8, 0x01a7},
1180         {0x01ad, 0x01ac}, {0x01b0, 0x01af}, {0x01b9, 0x01b8}, {0x01bd, 0x01bc},
1181         {0x01bf, 0x01f7}, {0x01c6, 0x01c4}, {0x01c9, 0x01c7}, {0x01cc, 0x01ca},
1182         {0x01dd, 0x018e}, {0x01f3, 0x01f1}, {0x023a, 0x2c65}, {0x023e, 0x2c66},
1183         {0x0253, 0x0181}, {0x0254, 0x0186}, {0x0259, 0x018f}, {0x025b, 0x0190},
1184         {0x0260, 0x0193}, {0x0263, 0x0194}, {0x0268, 0x0197}, {0x0269, 0x0196},
1185         {0x026b, 0x2c62}, {0x026f, 0x019c}, {0x0272, 0x019d}, {0x0275, 0x019f},
1186         {0x027d, 0x2c64}, {0x0280, 0x01a6}, {0x0283, 0x01a9}, {0x0288, 0x01ae},
1187         {0x0289, 0x0244}, {0x028c, 0x0245}, {0x0292, 0x01b7}, {0x03f2, 0x03f9},
1188         {0x04cf, 0x04c0}, {0x1d7d, 0x2c63}, {0x214e, 0x2132}, {0}
1189         };
1190 #else /* Vista */
1191         /*
1192          *      This is the table as defined by Windows XP
1193          */
1194         static int uc_run_table[][3] = { /* Start, End, Add */
1195         {0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
1196         {0x00E0, 0x00F7,  -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76,  86},
1197         {0x00F8, 0x00FF,  -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1198         {0x0256, 0x0258, -205}, {0x1F00, 0x1F08,   8}, {0x1F78, 0x1F7A, 128},
1199         {0x028A, 0x028C, -217}, {0x1F10, 0x1F16,   8}, {0x1F7A, 0x1F7C, 112},
1200         {0x03AC, 0x03AD,  -38}, {0x1F20, 0x1F28,   8}, {0x1F7C, 0x1F7E, 126},
1201         {0x03AD, 0x03B0,  -37}, {0x1F30, 0x1F38,   8}, {0x1FB0, 0x1FB2,   8},
1202         {0x03B1, 0x03C2,  -32}, {0x1F40, 0x1F46,   8}, {0x1FD0, 0x1FD2,   8},
1203         {0x03C2, 0x03C3,  -31}, {0x1F51, 0x1F52,   8}, {0x1FE0, 0x1FE2,   8},
1204         {0x03C3, 0x03CC,  -32}, {0x1F53, 0x1F54,   8}, {0x1FE5, 0x1FE6,   7},
1205         {0x03CC, 0x03CD,  -64}, {0x1F55, 0x1F56,   8}, {0x2170, 0x2180, -16},
1206         {0x03CD, 0x03CF,  -63}, {0x1F57, 0x1F58,   8}, {0x24D0, 0x24EA, -26},
1207         {0x0430, 0x0450,  -32}, {0x1F60, 0x1F68,   8}, {0xFF41, 0xFF5B, -32},
1208         {0}
1209         };
1210         static int uc_dup_table[][2] = { /* Start, End */
1211         {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1212         {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1213         {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1214         {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1215         {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1216         {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1217         {0}
1218         };
1219         static int uc_byte_table[][2] = { /* Offset, Value */
1220         {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1221         {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1222         {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1223         {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1224         {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1225         {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1226         {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1227         {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1228         {0}
1229         };
1230 #endif /* Vista */
1231         int i, r;
1232         int k, off;
1233
1234         memset((char*)uc, 0, uc_len);
1235         uc_len >>= 1;
1236         if (uc_len > 65536)
1237                 uc_len = 65536;
1238         for (i = 0; (u32)i < uc_len; i++)
1239                 uc[i] = cpu_to_le16(i);
1240         for (r = 0; uc_run_table[r][0]; r++) {
1241                 off = uc_run_table[r][2];
1242                 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
1243                         uc[i] = cpu_to_le16(i + off);
1244         }
1245         for (r = 0; uc_dup_table[r][0]; r++)
1246                 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
1247                         uc[i + 1] = cpu_to_le16(i);
1248         for (r = 0; uc_byte_table[r][0]; r++) {
1249                 k = uc_byte_table[r][1];
1250                 uc[uc_byte_table[r][0]] = cpu_to_le16(k);
1251         }
1252 }
1253
1254 /*
1255  *              Allocate and build the default upcase table
1256  *
1257  *      Returns the number of entries
1258  *              0 if failed
1259  */
1260
1261 #define UPCASE_LEN 65536 /* default number of entries in upcase */
1262
1263 u32 ntfs_upcase_build_default(ntfschar **upcase)
1264 {
1265         u32 upcase_len = 0;
1266
1267         *upcase = (ntfschar*)ntfs_malloc(UPCASE_LEN*2);
1268         if (*upcase) {
1269                 ntfs_upcase_table_build(*upcase, UPCASE_LEN*2);
1270                 upcase_len = UPCASE_LEN;
1271         }
1272         return (upcase_len);
1273 }
1274
1275 /*
1276  *              Build a table for converting to lower case
1277  *
1278  *      This is only meaningful when there is a single lower case
1279  *      character leading to an upper case one, and currently the
1280  *      only exception is the greek letter sigma which has a single
1281  *      upper case glyph (code U+03A3), but two lower case glyphs
1282  *      (code U+03C3 and U+03C2, the latter to be used at the end
1283  *      of a word). In the following implementation the upper case
1284  *      sigma will be lowercased as U+03C3.
1285  */
1286
1287 ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt)
1288 {
1289         ntfschar *lc;
1290         u32 upp;
1291         u32 i;
1292
1293         lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar));
1294         if (lc) {
1295                 for (i=0; i<uc_cnt; i++)
1296                         lc[i] = cpu_to_le16(i);
1297                 for (i=0; i<uc_cnt; i++) {
1298                         upp = le16_to_cpu(uc[i]);
1299                         if ((upp != i) && (upp < uc_cnt))
1300                                 lc[upp] = cpu_to_le16(i);
1301                 }
1302         } else
1303                 ntfs_log_error("Could not build the locase table\n");
1304         return (lc);
1305 }
1306
1307 /**
1308  * ntfs_str2ucs - convert a string to a valid NTFS file name
1309  * @s:          input string
1310  * @len:        length of output buffer in Unicode characters
1311  *
1312  * Convert the input @s string into the corresponding little endian,
1313  * 2-byte Unicode string. The length of the converted string is less
1314  * or equal to the maximum length allowed by the NTFS format (255).
1315  *
1316  * If @s is NULL then return AT_UNNAMED.
1317  *
1318  * On success the function returns the Unicode string in an allocated
1319  * buffer and the caller is responsible to free it when it's not needed
1320  * anymore.
1321  *
1322  * On error NULL is returned and errno is set to the error code.
1323  */
1324 ntfschar *ntfs_str2ucs(const char *s, int *len)
1325 {
1326         ntfschar *ucs = NULL;
1327
1328         if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) {
1329                 ntfs_log_perror("Couldn't convert '%s' to Unicode", s);
1330                 return NULL;
1331         }
1332         if (*len > NTFS_MAX_NAME_LEN) {
1333                 free(ucs);
1334                 errno = ENAMETOOLONG;
1335                 return NULL;
1336         }
1337         if (!ucs || !*len) {
1338                 ucs  = AT_UNNAMED;
1339                 *len = 0;
1340         }
1341         return ucs;
1342 }
1343
1344 /**
1345  * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1346  * @ucs         input string to be freed
1347  *
1348  * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1349  *
1350  * Return value: none.
1351  */
1352 void ntfs_ucsfree(ntfschar *ucs)
1353 {
1354         if (ucs && (ucs != AT_UNNAMED))
1355                 free(ucs);
1356 }
1357
1358 /*
1359  *              Check whether a name contains no chars forbidden
1360  *      for DOS or Win32 use
1361  *
1362  *      If there is a bad char, errno is set to EINVAL
1363  */
1364
1365 BOOL ntfs_forbidden_chars(const ntfschar *name, int len)
1366 {
1367         BOOL forbidden;
1368         int ch;
1369         int i;
1370         u32 mainset =     (1L << ('\"' - 0x20))
1371                         | (1L << ('*' - 0x20))
1372                         | (1L << ('/' - 0x20))
1373                         | (1L << (':' - 0x20))
1374                         | (1L << ('<' - 0x20))
1375                         | (1L << ('>' - 0x20))
1376                         | (1L << ('?' - 0x20));
1377
1378         forbidden = (len == 0)
1379                         || (le16_to_cpu(name[len-1]) == ' ')
1380                         || (le16_to_cpu(name[len-1]) == '.');
1381         for (i=0; i<len; i++) {
1382                 ch = le16_to_cpu(name[i]);
1383                 if ((ch < 0x20)
1384                     || ((ch < 0x40)
1385                         && ((1L << (ch - 0x20)) & mainset))
1386                     || (ch == '\\')
1387                     || (ch == '|'))
1388                         forbidden = TRUE;
1389         }
1390         if (forbidden)
1391                 errno = EINVAL;
1392         return (forbidden);
1393 }
1394
1395 /*
1396  *              Check whether the same name can be used as a DOS and
1397  *      a Win32 name
1398  *
1399  *      The names must be the same, or the short name the uppercase
1400  *      variant of the long name
1401  */
1402
1403 BOOL ntfs_collapsible_chars(ntfs_volume *vol,
1404                         const ntfschar *shortname, int shortlen,
1405                         const ntfschar *longname, int longlen)
1406 {
1407         BOOL collapsible;
1408         unsigned int ch;
1409         unsigned int cs;
1410         int i;
1411
1412         collapsible = shortlen == longlen;
1413         for (i=0; collapsible && (i<shortlen); i++) {
1414                 ch = le16_to_cpu(longname[i]);
1415                 cs = le16_to_cpu(shortname[i]);
1416                 if ((cs != ch)
1417                     && ((ch >= vol->upcase_len)
1418                         || (cs >= vol->upcase_len)
1419                         || (vol->upcase[cs] != vol->upcase[ch])))
1420                                 collapsible = FALSE;
1421         }
1422         return (collapsible);
1423 }
1424
1425 /*
1426  * Define the character encoding to be used.
1427  * Use UTF-8 unless specified otherwise.
1428  */
1429
1430 int ntfs_set_char_encoding(const char *locale)
1431 {
1432         use_utf8 = 0;
1433         if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
1434             || strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
1435                 use_utf8 = 1;
1436         else
1437                 if (setlocale(LC_ALL, locale))
1438                         use_utf8 = 0;
1439                 else {
1440                         ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1441                         use_utf8 = 1;
1442                 }
1443         return 0; /* always successful */
1444 }
1445
1446 #if defined(__APPLE__) || defined(__DARWIN__)
1447
/*
 * Enable (1) or disable (0) the normalization setting stored in
 * nfconvert_utf8.
 *
 * Returns 0 if the setting was recorded, -1 if @normalize is neither
 * 0 nor 1, or if ENABLE_NFCONV was not defined at build time.
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	if ((normalize != 0) && (normalize != 1))
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1460
/*
 * Normalize a UTF-8 string through CoreFoundation.
 *
 * @utf8_string is normalized to form C when @composed is non-zero,
 * to form D otherwise. On success *@target receives a buffer obtained
 * from ntfs_calloc() holding the '\0'-terminated result, and the
 * length of the result in bytes (terminator excluded) is returned.
 *
 * Returns -2 if the CFString could not be created, -3 if its mutable
 * copy could not be created and -1 for any other failure, or when
 * ENABLE_NFCONV was not defined at build time. *@target is left
 * untouched on failure.
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
 int composed) {
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to the linker. */
	CFStringRef source;
	CFMutableStringRef work;
	CFRange whole;
	CFIndex needed;
	char *buf = NULL;
	int buf_size = -1;

	/* Wrap the UTF-8 input into a CFString. */
	source = CFStringCreateWithCString(kCFAllocatorDefault, utf8_string, kCFStringEncodingUTF8);
	if(source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Work on a mutable copy, the source is not needed any more. */
	work = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, source);
	CFRelease(source);
	if(work == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	/* Apply the requested normalization form in place. */
	if(composed != 0)
		CFStringNormalize(work, kCFStringNormalizationFormC);
	else
		CFStringNormalize(work, kCFStringNormalizationFormD);

	/* First call : only ask for the number of bytes required. */
	whole = CFRangeMake(0, CFStringGetLength(work));
	if(CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false, NULL, 0, &needed) <= 0) {
		ntfs_log_error("Could not perform check for required length of UTF-8 conversion of normalized CFMutableString.\n");
		goto release;
	}

	/* One more byte than required, for the '\0' terminator. */
	buf_size = sizeof(char)*(needed + 1);
	buf = ntfs_calloc(buf_size);
	if(buf == NULL) {
		ntfs_log_error("Could not perform a ntfs_calloc of %d bytes for char *result.\n", buf_size);
		goto release;
	}

	/* Second call : actually store the UTF-8 bytes. */
	if(CFStringGetBytes(work, whole, kCFStringEncodingUTF8,
			    0, false, (UInt8*)buf, buf_size-1, &needed) <= 0) {
		ntfs_log_error("Could not perform UTF-8 conversion of normalized CFMutableString.\n");
		free(buf);
		buf = NULL;
	}

release:
	CFRelease(work);

	if(buf == NULL)
		return -1;
	*target = buf;
	return buf_size - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1523 #endif /* defined(__APPLE__) || defined(__DARWIN__) */