base/i18n/file_util_icu.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // File utilities that use the ICU library go in this file.
   6
   7 #include "base/i18n/file_util_icu.h"
   8
   9 #include "base/files/file_path.h"
  10 #include "base/i18n/icu_string_conversions.h"
  11 #include "base/i18n/string_compare.h"
  12 #include "base/logging.h"
  13 #include "base/memory/scoped_ptr.h"
  14 #include "base/memory/singleton.h"
  15 #include "base/strings/string_util.h"
  16 #include "base/strings/sys_string_conversions.h"
  17 #include "base/strings/utf_string_conversions.h"
  18 #include "build/build_config.h"
  19 #include "third_party/icu/source/common/unicode/uniset.h"
  20 #include "third_party/icu/source/i18n/unicode/coll.h"
  21
  22 namespace base {
  23 namespace i18n {
  24
  25 namespace {
  26
  27 class IllegalCharacters {
  28  public:
  29   static IllegalCharacters* GetInstance() {
  30     return Singleton<IllegalCharacters>::get();
  31   }
  32
  33   bool DisallowedEverywhere(UChar32 ucs4) {
  34     return !!illegal_anywhere_->contains(ucs4);
  35   }
  36
  37   bool DisallowedLeadingOrTrailing(UChar32 ucs4) {
  38     return !!illegal_at_ends_->contains(ucs4);
  39   }
  40
  41   bool IsAllowedName(const string16& s) {
  42     return s.empty() || (!!illegal_anywhere_->containsNone(
  43                              icu::UnicodeString(s.c_str(), s.size())) &&
  44                          !illegal_at_ends_->contains(*s.begin()) &&
  45                          !illegal_at_ends_->contains(*s.rbegin()));
  46   }
  47
  48  private:
  49   friend class Singleton<IllegalCharacters>;
  50   friend struct DefaultSingletonTraits<IllegalCharacters>;
  51
  52   IllegalCharacters();
  53   ~IllegalCharacters() { }
  54
  55   // set of characters considered invalid anywhere inside a filename.
  56   scoped_ptr<icu::UnicodeSet> illegal_anywhere_;
  57
  58   // set of characters considered invalid at either end of a filename.
  59   scoped_ptr<icu::UnicodeSet> illegal_at_ends_;
  60
  61   DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
  62 };
  63
  64 IllegalCharacters::IllegalCharacters() {
  65   UErrorCode everywhere_status = U_ZERO_ERROR;
  66   UErrorCode ends_status = U_ZERO_ERROR;
  67   // Control characters, formatting characters, non-characters, path separators,
  68   // and some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
  69   // See  http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
  70   // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
  71   // Note that code points in the "Other, Format" (Cf) category are ignored on
  72   // HFS+ despite the ZERO_WIDTH_JOINER and ZERO_WIDTH_NON-JOINER being
  73   // legitimate in Arabic and some S/SE Asian scripts. In addition tilde (~) is
  74   // also excluded due to the possibility of interacting poorly with short
  75   // filenames on VFAT. (Related to CVE-2014-9390)
  76   illegal_anywhere_.reset(new icu::UnicodeSet(
  77       UNICODE_STRING_SIMPLE("[[\"~*/:<>?\\\\|][:Cc:][:Cf:]]"),
  78       everywhere_status));
  79   illegal_at_ends_.reset(new icu::UnicodeSet(
  80       UNICODE_STRING_SIMPLE("[[:WSpace:][.]]"), ends_status));
  81   DCHECK(U_SUCCESS(everywhere_status));
  82   DCHECK(U_SUCCESS(ends_status));
  83
  84   // Add non-characters. If this becomes a performance bottleneck by
  85   // any chance, do not add these to |set| and change IsFilenameLegal()
  86   // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling
  87   // IsAllowedName().
  88   illegal_anywhere_->add(0xFDD0, 0xFDEF);
  89   for (int i = 0; i <= 0x10; ++i) {
  90     int plane_base = 0x10000 * i;
  91     illegal_anywhere_->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
  92   }
  93   illegal_anywhere_->freeze();
  94   illegal_at_ends_->freeze();
  95 }
  96
  97 }  // namespace
  98
  99 bool IsFilenameLegal(const string16& file_name) {
 100   return IllegalCharacters::GetInstance()->IsAllowedName(file_name);
 101 }
 102
 103 void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
 104                                     char replace_char) {
 105   IllegalCharacters* illegal = IllegalCharacters::GetInstance();
 106
 107   DCHECK(!(illegal->DisallowedEverywhere(replace_char)));
 108   DCHECK(!(illegal->DisallowedLeadingOrTrailing(replace_char)));
 109
 110   int cursor = 0;  // The ICU macros expect an int.
 111   while (cursor < static_cast<int>(file_name->size())) {
 112     int char_begin = cursor;
 113     uint32 code_point;
 114 #if defined(OS_MACOSX)
 115     // Mac uses UTF-8 encoding for filenames.
 116     U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
 117             code_point);
 118 #elif defined(OS_WIN)
 119     // Windows uses UTF-16 encoding for filenames.
 120     U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
 121              code_point);
 122 #elif defined(OS_POSIX)
 123     // Linux doesn't actually define an encoding. It basically allows anything
 124     // except for a few special ASCII characters.
 125     unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]);
 126     if (cur_char >= 0x80)
 127       continue;
 128     code_point = cur_char;
 129 #else
 130     NOTREACHED();
 131 #endif
 132
 133     if (illegal->DisallowedEverywhere(code_point) ||
 134         ((char_begin == 0 || cursor == static_cast<int>(file_name->length())) &&
 135          illegal->DisallowedLeadingOrTrailing(code_point))) {
 136       file_name->replace(char_begin, cursor - char_begin, 1, replace_char);
 137       // We just made the potentially multi-byte/word char into one that only
 138       // takes one byte/word, so need to adjust the cursor to point to the next
 139       // character again.
 140       cursor = char_begin + 1;
 141     }
 142   }
 143 }
 144
 145 bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
 146   UErrorCode error_code = U_ZERO_ERROR;
 147   // Use the default collator. The default locale should have been properly
 148   // set by the time this constructor is called.
 149   scoped_ptr<icu::Collator> collator(icu::Collator::createInstance(error_code));
 150   DCHECK(U_SUCCESS(error_code));
 151   // Make it case-sensitive.
 152   collator->setStrength(icu::Collator::TERTIARY);
 153
 154 #if defined(OS_WIN)
 155   return CompareString16WithCollator(collator.get(),
 156       WideToUTF16(a.value()), WideToUTF16(b.value())) == UCOL_LESS;
 157
 158 #elif defined(OS_POSIX)
 159   // On linux, the file system encoding is not defined. We assume
 160   // SysNativeMBToWide takes care of it.
 161   return CompareString16WithCollator(
 162       collator.get(),
 163       WideToUTF16(SysNativeMBToWide(a.value().c_str())),
 164       WideToUTF16(SysNativeMBToWide(b.value().c_str()))) == UCOL_LESS;
 165 #else
 166   #error Not implemented on your system
 167 #endif
 168 }
 169
 170 void NormalizeFileNameEncoding(FilePath* file_name) {
 171 #if defined(OS_CHROMEOS)
 172   std::string normalized_str;
 173   if (ConvertToUtf8AndNormalize(file_name->BaseName().value(),
 174                                 kCodepageUTF8,
 175                                 &normalized_str)) {
 176     *file_name = file_name->DirName().Append(FilePath(normalized_str));
 177   }
 178 #endif
 179 }
 180
 181 }  // namespace i18n
 182 }  // namespace base