base/string_util.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "base/string_util.h"
   6
   7 #include "build/build_config.h"
   8
   9 #include <ctype.h>
  10 #include <errno.h>
  11 #include <math.h>
  12 #include <stdarg.h>
  13 #include <stdio.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16 #include <time.h>
  17 #include <wchar.h>
  18 #include <wctype.h>
  19
  20 #include <algorithm>
  21 #include <vector>
  22
  23 #include "base/basictypes.h"
  24 #include "base/logging.h"
  25 #include "base/memory/singleton.h"
  26 #include "base/third_party/dmg_fp/dmg_fp.h"
  27 #include "base/utf_string_conversion_utils.h"
  28 #include "base/utf_string_conversions.h"
  29 #include "base/third_party/icu/icu_utf.h"
  30
  31 namespace {
  32
  33 // Force the singleton used by Empty[W]String[16] to be a unique type. This
  34 // prevents other code that might accidentally use Singleton<string> from
  35 // getting our internal one.
  36 struct EmptyStrings {
  37   EmptyStrings() {}
  38   const std::string s;
  39   const std::wstring ws;
  40   const string16 s16;
  41
  42   static EmptyStrings* GetInstance() {
  43     return Singleton<EmptyStrings>::get();
  44   }
  45 };
  46
  47 // Used by ReplaceStringPlaceholders to track the position in the string of
  48 // replaced parameters.
  49 struct ReplacementOffset {
  50   ReplacementOffset(uintptr_t parameter, size_t offset)
  51       : parameter(parameter),
  52         offset(offset) {}
  53
  54   // Index of the parameter.
  55   uintptr_t parameter;
  56
  57   // Starting position in the string.
  58   size_t offset;
  59 };
  60
  61 static bool CompareParameter(const ReplacementOffset& elem1,
  62                              const ReplacementOffset& elem2) {
  63   return elem1.parameter < elem2.parameter;
  64 }
  65
  66 }  // namespace
  67
  68 namespace base {
  69
  70 bool IsWprintfFormatPortable(const wchar_t* format) {
  71   for (const wchar_t* position = format; *position != '\0'; ++position) {
  72     if (*position == '%') {
  73       bool in_specification = true;
  74       bool modifier_l = false;
  75       while (in_specification) {
  76         // Eat up characters until reaching a known specifier.
  77         if (*++position == '\0') {
  78           // The format string ended in the middle of a specification.  Call
  79           // it portable because no unportable specifications were found.  The
  80           // string is equally broken on all platforms.
  81           return true;
  82         }
  83
  84         if (*position == 'l') {
  85           // 'l' is the only thing that can save the 's' and 'c' specifiers.
  86           modifier_l = true;
  87         } else if (((*position == 's' || *position == 'c') && !modifier_l) ||
  88                    *position == 'S' || *position == 'C' || *position == 'F' ||
  89                    *position == 'D' || *position == 'O' || *position == 'U') {
  90           // Not portable.
  91           return false;
  92         }
  93
  94         if (wcschr(L"diouxXeEfgGaAcspn%", *position)) {
  95           // Portable, keep scanning the rest of the format string.
  96           in_specification = false;
  97         }
  98       }
  99     }
 100   }
 101
 102   return true;
 103 }
 104
 105 }  // namespace base
 106
 107
// Returns a reference to the shared empty std::string held by the
// EmptyStrings instance.
const std::string& EmptyString() {
  return EmptyStrings::GetInstance()->s;
}
 111
// Returns a reference to the shared empty std::wstring held by the
// EmptyStrings instance.
const std::wstring& EmptyWString() {
  return EmptyStrings::GetInstance()->ws;
}
 115
// Returns a reference to the shared empty string16 held by the EmptyStrings
// instance.
const string16& EmptyString16() {
  return EmptyStrings::GetInstance()->s16;
}
 119
// Comma-separated list of the code points treated as whitespace, ending with
// a 0 terminator. It is written as a macro so the same list can be expanded
// into both the wchar_t table (kWhitespaceWide) and the char16 table
// (kWhitespaceUTF16) below.
#define WHITESPACE_UNICODE \
  0x0009, /* <control-0009> to <control-000D> */ \
  0x000A,                                        \
  0x000B,                                        \
  0x000C,                                        \
  0x000D,                                        \
  0x0020, /* Space */                            \
  0x0085, /* <control-0085> */                   \
  0x00A0, /* No-Break Space */                   \
  0x1680, /* Ogham Space Mark */                 \
  0x180E, /* Mongolian Vowel Separator */        \
  0x2000, /* En Quad to Hair Space */            \
  0x2001,                                        \
  0x2002,                                        \
  0x2003,                                        \
  0x2004,                                        \
  0x2005,                                        \
  0x2006,                                        \
  0x2007,                                        \
  0x2008,                                        \
  0x2009,                                        \
  0x200A,                                        \
  0x200C, /* Zero Width Non-Joiner */            \
  0x2028, /* Line Separator */                   \
  0x2029, /* Paragraph Separator */              \
  0x202F, /* Narrow No-Break Space */            \
  0x205F, /* Medium Mathematical Space */        \
  0x3000, /* Ideographic Space */                \
  0
 149
// Zero-terminated whitespace table for wchar_t strings, expanded from
// WHITESPACE_UNICODE.
const wchar_t kWhitespaceWide[] = {
  WHITESPACE_UNICODE
};
// Zero-terminated whitespace table for char16 strings, expanded from the same
// WHITESPACE_UNICODE list.
const char16 kWhitespaceUTF16[] = {
  WHITESPACE_UNICODE
};
// Zero-terminated whitespace table for narrow strings: the ASCII-range subset
// (tab through carriage return, plus space).
const char kWhitespaceASCII[] = {
  0x09,    // <control-0009> to <control-000D>
  0x0A,
  0x0B,
  0x0C,
  0x0D,
  0x20,    // Space
  0
};
 165
// The three bytes EF BB BF (the UTF-8 encoding of the byte order mark),
// followed by the string literal's terminating NUL.
const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF";
 167
 168 template<typename STR>
 169 bool RemoveCharsT(const STR& input,
 170                   const typename STR::value_type remove_chars[],
 171                   STR* output) {
 172   bool removed = false;
 173   size_t found;
 174
 175   *output = input;
 176
 177   found = output->find_first_of(remove_chars);
 178   while (found != STR::npos) {
 179     removed = true;
 180     output->replace(found, 1, STR());
 181     found = output->find_first_of(remove_chars, found);
 182   }
 183
 184   return removed;
 185 }
 186
// Public RemoveChars overloads. Each one forwards to RemoveCharsT for its
// string type: |output| receives |input| minus every character listed in
// |remove_chars|, and the return value says whether anything was removed.
bool RemoveChars(const std::wstring& input,
                 const wchar_t remove_chars[],
                 std::wstring* output) {
  return RemoveCharsT(input, remove_chars, output);
}

// The string16 overload is compiled only when WCHAR_T_IS_UTF16 is not defined
// (presumably string16 and std::wstring are the same type otherwise, which
// would make this a redefinition -- confirm against the string16 header).
#if !defined(WCHAR_T_IS_UTF16)
bool RemoveChars(const string16& input,
                 const char16 remove_chars[],
                 string16* output) {
  return RemoveCharsT(input, remove_chars, output);
}
#endif

bool RemoveChars(const std::string& input,
                 const char remove_chars[],
                 std::string* output) {
  return RemoveCharsT(input, remove_chars, output);
}
 206
 207 template<typename STR>
 208 TrimPositions TrimStringT(const STR& input,
 209                           const typename STR::value_type trim_chars[],
 210                           TrimPositions positions,
 211                           STR* output) {
 212   // Find the edges of leading/trailing whitespace as desired.
 213   const typename STR::size_type last_char = input.length() - 1;
 214   const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ?
 215       input.find_first_not_of(trim_chars) : 0;
 216   const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ?
 217       input.find_last_not_of(trim_chars) : last_char;
 218
 219   // When the string was all whitespace, report that we stripped off whitespace
 220   // from whichever position the caller was interested in.  For empty input, we
 221   // stripped no whitespace, but we still need to clear |output|.
 222   if (input.empty() ||
 223       (first_good_char == STR::npos) || (last_good_char == STR::npos)) {
 224     bool input_was_empty = input.empty();  // in case output == &input
 225     output->clear();
 226     return input_was_empty ? TRIM_NONE : positions;
 227   }
 228
 229   // Trim the whitespace.
 230   *output =
 231       input.substr(first_good_char, last_good_char - first_good_char + 1);
 232
 233   // Return where we trimmed from.
 234   return static_cast<TrimPositions>(
 235       ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
 236       ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
 237 }
 238
// Public TrimString overloads. Each one trims characters in |trim_chars| from
// both ends of |input| (TRIM_ALL) via TrimStringT, writes the result to
// |output|, and returns true when TrimStringT reports anything other than
// TRIM_NONE.
bool TrimString(const std::wstring& input,
                const wchar_t trim_chars[],
                std::wstring* output) {
  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
}

// Compiled only when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool TrimString(const string16& input,
                const char16 trim_chars[],
                string16* output) {
  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
}
#endif

bool TrimString(const std::string& input,
                const char trim_chars[],
                std::string* output) {
  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
}
 258
// Writes to |output| a prefix of |input| that is at most |byte_size| bytes
// long and ends on a character boundary accepted by the validity checks
// below.
//
// - If |byte_size| exceeds the input length, |output| is a full copy.
// - Otherwise the code probes backwards from byte |byte_size| - 1 for a
//   position at which a valid character can be decoded within the first
//   |byte_size| bytes, and cuts the string right after that character.
// - If no such position exists, |output| is cleared.
//
// |output| must be non-null (DCHECKed).
void TruncateUTF8ToByteSize(const std::string& input,
                            const size_t byte_size,
                            std::string* output) {
  DCHECK(output);
  if (byte_size > input.length()) {
    *output = input;
    return;
  }
  DCHECK_LE(byte_size, static_cast<uint32>(kint32max));
  // Note: This cast is necessary because CBU8_NEXT uses int32s.
  int32 truncation_length = static_cast<int32>(byte_size);
  int32 char_index = truncation_length - 1;
  const char* data = input.data();

  // Using CBU8, we will move backwards from the truncation point
  // to the beginning of the string looking for a valid UTF8
  // character.  Once a full UTF8 character is found, we will
  // truncate the string to the end of that character.
  while (char_index >= 0) {
    // Remember the probe position: CBU8_NEXT updates |char_index| in place
    // (presumably advancing it past the decoded sequence -- see the ICU
    // macro), so a failed probe must restart from one byte before |prev|.
    int32 prev = char_index;
    uint32 code_point = 0;
    CBU8_NEXT(data, char_index, truncation_length, code_point);
    if (!base::IsValidCharacter(code_point) ||
        !base::IsValidCodepoint(code_point)) {
      char_index = prev - 1;
    } else {
      break;
    }
  }

  // On success |char_index| is the value CBU8_NEXT left behind, which is used
  // as the length of the prefix to keep; -1 means no valid character was
  // found at any probe position.
  if (char_index >= 0 )
    *output = input.substr(0, char_index);
  else
    output->clear();
}
 294
// Trims the characters in kWhitespaceWide from the ends of |input| selected
// by |positions|, storing the result in |output|. Returns what TrimStringT
// reports about which ends were trimmed.
TrimPositions TrimWhitespace(const std::wstring& input,
                             TrimPositions positions,
                             std::wstring* output) {
  return TrimStringT(input, kWhitespaceWide, positions, output);
}

// string16 counterpart using kWhitespaceUTF16. Compiled only when
// WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
TrimPositions TrimWhitespace(const string16& input,
                             TrimPositions positions,
                             string16* output) {
  return TrimStringT(input, kWhitespaceUTF16, positions, output);
}
#endif
 308
// Trims the characters in kWhitespaceASCII from the ends of |input| selected
// by |positions|, storing the result in |output|. Returns what TrimStringT
// reports about which ends were trimmed.
TrimPositions TrimWhitespaceASCII(const std::string& input,
                                  TrimPositions positions,
                                  std::string* output) {
  return TrimStringT(input, kWhitespaceASCII, positions, output);
}
 314
// This function is only for backward-compatibility.
// To be removed when all callers are updated.
// It simply forwards to TrimWhitespaceASCII with the same arguments.
TrimPositions TrimWhitespace(const std::string& input,
                             TrimPositions positions,
                             std::string* output) {
  return TrimWhitespaceASCII(input, positions, output);
}
 322
 323 template<typename STR>
 324 STR CollapseWhitespaceT(const STR& text,
 325                         bool trim_sequences_with_line_breaks) {
 326   STR result;
 327   result.resize(text.size());
 328
 329   // Set flags to pretend we're already in a trimmed whitespace sequence, so we
 330   // will trim any leading whitespace.
 331   bool in_whitespace = true;
 332   bool already_trimmed = true;
 333
 334   int chars_written = 0;
 335   for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
 336     if (IsWhitespace(*i)) {
 337       if (!in_whitespace) {
 338         // Reduce all whitespace sequences to a single space.
 339         in_whitespace = true;
 340         result[chars_written++] = L' ';
 341       }
 342       if (trim_sequences_with_line_breaks && !already_trimmed &&
 343           ((*i == '\n') || (*i == '\r'))) {
 344         // Whitespace sequences containing CR or LF are eliminated entirely.
 345         already_trimmed = true;
 346         --chars_written;
 347       }
 348     } else {
 349       // Non-whitespace chracters are copied straight across.
 350       in_whitespace = false;
 351       already_trimmed = false;
 352       result[chars_written++] = *i;
 353     }
 354   }
 355
 356   if (in_whitespace && !already_trimmed) {
 357     // Any trailing whitespace is eliminated.
 358     --chars_written;
 359   }
 360
 361   result.resize(chars_written);
 362   return result;
 363 }
 364
// Wide-string front-end for CollapseWhitespaceT; see that template for the
// collapsing rules.
std::wstring CollapseWhitespace(const std::wstring& text,
                                bool trim_sequences_with_line_breaks) {
  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
}

// string16 front-end for CollapseWhitespaceT. Compiled only when
// WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
string16 CollapseWhitespace(const string16& text,
                            bool trim_sequences_with_line_breaks) {
  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
}
#endif
 376
// Narrow-string front-end for CollapseWhitespaceT; see that template for the
// collapsing rules.
std::string CollapseWhitespaceASCII(const std::string& text,
                                    bool trim_sequences_with_line_breaks) {
  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
}
 381
 382 bool ContainsOnlyWhitespaceASCII(const std::string& str) {
 383   for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) {
 384     if (!IsAsciiWhitespace(*i))
 385       return false;
 386   }
 387   return true;
 388 }
 389
 390 bool ContainsOnlyWhitespace(const string16& str) {
 391   for (string16::const_iterator i(str.begin()); i != str.end(); ++i) {
 392     if (!IsWhitespace(*i))
 393       return false;
 394   }
 395   return true;
 396 }
 397
 398 template<typename STR>
 399 static bool ContainsOnlyCharsT(const STR& input, const STR& characters) {
 400   for (typename STR::const_iterator iter = input.begin();
 401        iter != input.end(); ++iter) {
 402     if (characters.find(*iter) == STR::npos)
 403       return false;
 404   }
 405   return true;
 406 }
 407
// Public ContainsOnlyChars overloads. Each forwards to ContainsOnlyCharsT,
// which returns true when every character of |input| occurs in |characters|.
bool ContainsOnlyChars(const std::wstring& input,
                       const std::wstring& characters) {
  return ContainsOnlyCharsT(input, characters);
}

// Compiled only when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool ContainsOnlyChars(const string16& input, const string16& characters) {
  return ContainsOnlyCharsT(input, characters);
}
#endif

bool ContainsOnlyChars(const std::string& input,
                       const std::string& characters) {
  return ContainsOnlyCharsT(input, characters);
}
 423
// Builds a std::string by copying |wide| element by element. The input is
// expected to be pure ASCII; this is only DCHECKed, and no check is visible
// here for builds where the DCHECK is inactive.
std::string WideToASCII(const std::wstring& wide) {
  DCHECK(IsStringASCII(wide)) << wide;
  return std::string(wide.begin(), wide.end());
}
 428
// Builds a std::string by copying |utf16| element by element. The input is
// expected to be pure ASCII; this is only DCHECKed, and no check is visible
// here for builds where the DCHECK is inactive.
std::string UTF16ToASCII(const string16& utf16) {
  DCHECK(IsStringASCII(utf16)) << utf16;
  return std::string(utf16.begin(), utf16.end());
}
 433
 434 // Latin1 is just the low range of Unicode, so we can copy directly to convert.
 435 bool WideToLatin1(const std::wstring& wide, std::string* latin1) {
 436   std::string output;
 437   output.resize(wide.size());
 438   latin1->clear();
 439   for (size_t i = 0; i < wide.size(); i++) {
 440     if (wide[i] > 255)
 441       return false;
 442     output[i] = static_cast<char>(wide[i]);
 443   }
 444   latin1->swap(output);
 445   return true;
 446 }
 447
 448 template<class STR>
 449 static bool DoIsStringASCII(const STR& str) {
 450   for (size_t i = 0; i < str.length(); i++) {
 451     typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i];
 452     if (c > 0x7F)
 453       return false;
 454   }
 455   return true;
 456 }
 457
// Public IsStringASCII overloads. Each forwards to DoIsStringASCII, which
// returns true when no element of |str| exceeds 0x7F.
bool IsStringASCII(const std::wstring& str) {
  return DoIsStringASCII(str);
}

// Compiled only when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool IsStringASCII(const string16& str) {
  return DoIsStringASCII(str);
}
#endif

bool IsStringASCII(const base::StringPiece& str) {
  return DoIsStringASCII(str);
}
 471
// Returns true if |str| can be decoded from start to end with CBU8_NEXT and
// every decoded code point passes base::IsValidCharacter(). An empty string
// yields true.
//
// NOTE(review): the length is narrowed to int32, so this assumes |str| is
// shorter than 2^31 bytes -- confirm with callers.
bool IsStringUTF8(const std::string& str) {
  const char *src = str.data();
  int32 src_len = static_cast<int32>(str.length());
  int32 char_index = 0;

  while (char_index < src_len) {
    int32 code_point;
    // CBU8_NEXT writes the decoded value to |code_point| and updates
    // |char_index| in place, which is what moves this loop forward.
    CBU8_NEXT(src, char_index, src_len, code_point);
    if (!base::IsValidCharacter(code_point))
       return false;
  }
  return true;
}
 485
 486 template<typename Iter>
 487 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
 488                                           Iter a_end,
 489                                           const char* b) {
 490   for (Iter it = a_begin; it != a_end; ++it, ++b) {
 491     if (!*b || base::ToLowerASCII(*it) != *b)
 492       return false;
 493   }
 494   return *b == 0;
 495 }
 496
// Front-ends for LowerCaseEqualsASCII.
// Whole-string overloads: compare the lower-cased contents of |a| with the
// NUL-terminated string |b| via DoLowerCaseEqualsASCII.
bool LowerCaseEqualsASCII(const std::string& a, const char* b) {
  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
}

bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) {
  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
}

// Compiled only when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool LowerCaseEqualsASCII(const string16& a, const char* b) {
  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
}
#endif
 511
// Iterator-range overloads: compare the lower-cased range [a_begin, a_end)
// with the NUL-terminated string |b| via DoLowerCaseEqualsASCII.
bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
                          std::string::const_iterator a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}

bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
                          std::wstring::const_iterator a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}

// Compiled only when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
                          string16::const_iterator a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
#endif
 531
// Raw-pointer-range overloads: compare the lower-cased range
// [a_begin, a_end) with the NUL-terminated string |b| via
// DoLowerCaseEqualsASCII.
bool LowerCaseEqualsASCII(const char* a_begin,
                          const char* a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}

bool LowerCaseEqualsASCII(const wchar_t* a_begin,
                          const wchar_t* a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}

// Compiled only when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool LowerCaseEqualsASCII(const char16* a_begin,
                          const char16* a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
#endif
 551
 552 bool EqualsASCII(const string16& a, const base::StringPiece& b) {
 553   if (a.length() != b.length())
 554     return false;
 555   return std::equal(b.begin(), b.end(), a.begin());
 556 }
 557
 558 bool StartsWithASCII(const std::string& str,
 559                      const std::string& search,
 560                      bool case_sensitive) {
 561   if (case_sensitive)
 562     return str.compare(0, search.length(), search) == 0;
 563   else
 564     return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0;
 565 }
 566
 567 template <typename STR>
 568 bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) {
 569   if (case_sensitive) {
 570     return str.compare(0, search.length(), search) == 0;
 571   } else {
 572     if (search.size() > str.size())
 573       return false;
 574     return std::equal(search.begin(), search.end(), str.begin(),
 575                       base::CaseInsensitiveCompare<typename STR::value_type>());
 576   }
 577 }
 578
// Public StartsWith overloads. Each forwards to StartsWithT, which reports
// whether |str| begins with |search|, optionally ignoring case.
bool StartsWith(const std::wstring& str, const std::wstring& search,
                bool case_sensitive) {
  return StartsWithT(str, search, case_sensitive);
}

// Compiled only when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool StartsWith(const string16& str, const string16& search,
                bool case_sensitive) {
  return StartsWithT(str, search, case_sensitive);
}
#endif
 590
 591 template <typename STR>
 592 bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) {
 593   typename STR::size_type str_length = str.length();
 594   typename STR::size_type search_length = search.length();
 595   if (search_length > str_length)
 596     return false;
 597   if (case_sensitive) {
 598     return str.compare(str_length - search_length, search_length, search) == 0;
 599   } else {
 600     return std::equal(search.begin(), search.end(),
 601                       str.begin() + (str_length - search_length),
 602                       base::CaseInsensitiveCompare<typename STR::value_type>());
 603   }
 604 }
 605
// Public EndsWith overloads. Each forwards to EndsWithT, which reports
// whether |str| ends with |search|, optionally ignoring case.
bool EndsWith(const std::string& str, const std::string& search,
              bool case_sensitive) {
  return EndsWithT(str, search, case_sensitive);
}

bool EndsWith(const std::wstring& str, const std::wstring& search,
              bool case_sensitive) {
  return EndsWithT(str, search, case_sensitive);
}

// Compiled only when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool EndsWith(const string16& str, const string16& search,
              bool case_sensitive) {
  return EndsWithT(str, search, case_sensitive);
}
#endif
 622
 623 DataUnits GetByteDisplayUnits(int64 bytes) {
 624   // The byte thresholds at which we display amounts.  A byte count is displayed
 625   // in unit U when kUnitThresholds[U] <= bytes < kUnitThresholds[U+1].
 626   // This must match the DataUnits enum.
 627   static const int64 kUnitThresholds[] = {
 628     0,              // DATA_UNITS_BYTE,
 629     3*1024,         // DATA_UNITS_KIBIBYTE,
 630     2*1024*1024,    // DATA_UNITS_MEBIBYTE,
 631     1024*1024*1024  // DATA_UNITS_GIBIBYTE,
 632   };
 633
 634   if (bytes < 0) {
 635     NOTREACHED() << "Negative bytes value";
 636     return DATA_UNITS_BYTE;
 637   }
 638
 639   int unit_index = arraysize(kUnitThresholds);
 640   while (--unit_index > 0) {
 641     if (bytes >= kUnitThresholds[unit_index])
 642       break;
 643   }
 644
 645   DCHECK(unit_index >= DATA_UNITS_BYTE && unit_index <= DATA_UNITS_GIBIBYTE);
 646   return DataUnits(unit_index);
 647 }
 648
// TODO(mpcomplete): deal with locale
// Byte suffixes.  This must match the DataUnits enum.
// FormatBytesInternal indexes this table with a DataUnits value.
static const char* const kByteStrings[] = {
  "B",
  "kB",
  "MB",
  "GB"
};

// Per-second suffixes, in the same order as kByteStrings; indexed the same
// way by FormatBytesInternal.
static const char* const kSpeedStrings[] = {
  "B/s",
  "kB/s",
  "MB/s",
  "GB/s"
};
 664
 665 string16 FormatBytesInternal(int64 bytes,
 666                              DataUnits units,
 667                              bool show_units,
 668                              const char* const* suffix) {
 669   if (bytes < 0) {
 670     NOTREACHED() << "Negative bytes value";
 671     return string16();
 672   }
 673
 674   DCHECK(units >= DATA_UNITS_BYTE && units <= DATA_UNITS_GIBIBYTE);
 675
 676   // Put the quantity in the right units.
 677   double unit_amount = static_cast<double>(bytes);
 678   for (int i = 0; i < units; ++i)
 679     unit_amount /= 1024.0;
 680
 681   char buf[64];
 682   if (bytes != 0 && units != DATA_UNITS_BYTE && unit_amount < 100)
 683     base::snprintf(buf, arraysize(buf), "%.1lf", unit_amount);
 684   else
 685     base::snprintf(buf, arraysize(buf), "%.0lf", unit_amount);
 686
 687   std::string ret(buf);
 688   if (show_units) {
 689     ret += " ";
 690     ret += suffix[units];
 691   }
 692
 693   return ASCIIToUTF16(ret);
 694 }
 695
// Formats |bytes| in |units| using the kByteStrings suffix table; see
// FormatBytesInternal for the number formatting rules.
string16 FormatBytes(int64 bytes, DataUnits units, bool show_units) {
  return FormatBytesInternal(bytes, units, show_units, kByteStrings);
}
 699
// Formats |bytes| in |units| using the kSpeedStrings (per-second) suffix
// table; see FormatBytesInternal for the number formatting rules.
string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units) {
  return FormatBytesInternal(bytes, units, show_units, kSpeedStrings);
}
 703
 704 template<class StringType>
 705 void DoReplaceSubstringsAfterOffset(StringType* str,
 706                                     typename StringType::size_type start_offset,
 707                                     const StringType& find_this,
 708                                     const StringType& replace_with,
 709                                     bool replace_all) {
 710   if ((start_offset == StringType::npos) || (start_offset >= str->length()))
 711     return;
 712
 713   DCHECK(!find_this.empty());
 714   for (typename StringType::size_type offs(str->find(find_this, start_offset));
 715       offs != StringType::npos; offs = str->find(find_this, offs)) {
 716     str->replace(offs, find_this.length(), replace_with);
 717     offs += replace_with.length();
 718
 719     if (!replace_all)
 720       break;
 721   }
 722 }
 723
// Replaces the first occurrence of |find_this| at or after |start_offset| in
// |*str| with |replace_with|. Both overloads forward to
// DoReplaceSubstringsAfterOffset with replace_all == false.
void ReplaceFirstSubstringAfterOffset(string16* str,
                                      string16::size_type start_offset,
                                      const string16& find_this,
                                      const string16& replace_with) {
  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
                                 false);  // replace first instance
}

void ReplaceFirstSubstringAfterOffset(std::string* str,
                                      std::string::size_type start_offset,
                                      const std::string& find_this,
                                      const std::string& replace_with) {
  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
                                 false);  // replace first instance
}
 739
// Replaces every occurrence of |find_this| at or after |start_offset| in
// |*str| with |replace_with|. Both overloads forward to
// DoReplaceSubstringsAfterOffset with replace_all == true.
void ReplaceSubstringsAfterOffset(string16* str,
                                  string16::size_type start_offset,
                                  const string16& find_this,
                                  const string16& replace_with) {
  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
                                 true);  // replace all instances
}

void ReplaceSubstringsAfterOffset(std::string* str,
                                  std::string::size_type start_offset,
                                  const std::string& find_this,
                                  const std::string& replace_with) {
  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
                                 true);  // replace all instances
}
 755
 756
 757 template<typename STR>
 758 static size_t TokenizeT(const STR& str,
 759                         const STR& delimiters,
 760                         std::vector<STR>* tokens) {
 761   tokens->clear();
 762
 763   typename STR::size_type start = str.find_first_not_of(delimiters);
 764   while (start != STR::npos) {
 765     typename STR::size_type end = str.find_first_of(delimiters, start + 1);
 766     if (end == STR::npos) {
 767       tokens->push_back(str.substr(start));
 768       break;
 769     } else {
 770       tokens->push_back(str.substr(start, end - start));
 771       start = str.find_first_not_of(delimiters, end + 1);
 772     }
 773   }
 774
 775   return tokens->size();
 776 }
 777
// Public Tokenize overloads. Each forwards to TokenizeT, which splits |str|
// at any character in |delimiters|, stores the non-empty pieces in |tokens|
// and returns how many there are.
size_t Tokenize(const std::wstring& str,
                const std::wstring& delimiters,
                std::vector<std::wstring>* tokens) {
  return TokenizeT(str, delimiters, tokens);
}

// Compiled only when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
size_t Tokenize(const string16& str,
                const string16& delimiters,
                std::vector<string16>* tokens) {
  return TokenizeT(str, delimiters, tokens);
}
#endif

size_t Tokenize(const std::string& str,
                const std::string& delimiters,
                std::vector<std::string>* tokens) {
  return TokenizeT(str, delimiters, tokens);
}

// NOTE(review): the resulting StringPiece tokens presumably refer to the
// memory behind |str| rather than owning copies -- confirm against
// StringPiece::substr before relying on their lifetime.
size_t Tokenize(const base::StringPiece& str,
                const base::StringPiece& delimiters,
                std::vector<base::StringPiece>* tokens) {
  return TokenizeT(str, delimiters, tokens);
}
 803
 804 template<typename STR>
 805 static STR JoinStringT(const std::vector<STR>& parts,
 806                        typename STR::value_type sep) {
 807   if (parts.empty())
 808     return STR();
 809
 810   STR result(parts[0]);
 811   typename std::vector<STR>::const_iterator iter = parts.begin();
 812   ++iter;
 813
 814   for (; iter != parts.end(); ++iter) {
 815     result += sep;
 816     result += *iter;
 817   }
 818
 819   return result;
 820 }
 821
// Public JoinString overloads. Each forwards to JoinStringT, which
// concatenates |parts| with |sep| between consecutive elements.
std::string JoinString(const std::vector<std::string>& parts, char sep) {
  return JoinStringT(parts, sep);
}

string16 JoinString(const std::vector<string16>& parts, char16 sep) {
  return JoinStringT(parts, sep);
}
 829
 830 template<class FormatStringType, class OutStringType>
 831 OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string,
 832     const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) {
 833   size_t substitutions = subst.size();
 834   DCHECK(substitutions < 10);
 835
 836   size_t sub_length = 0;
 837   for (typename std::vector<OutStringType>::const_iterator iter = subst.begin();
 838        iter != subst.end(); ++iter) {
 839     sub_length += iter->length();
 840   }
 841
 842   OutStringType formatted;
 843   formatted.reserve(format_string.length() + sub_length);
 844
 845   std::vector<ReplacementOffset> r_offsets;
 846   for (typename FormatStringType::const_iterator i = format_string.begin();
 847        i != format_string.end(); ++i) {
 848     if ('$' == *i) {
 849       if (i + 1 != format_string.end()) {
 850         ++i;
 851         DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i;
 852         if ('$' == *i) {
 853           while (i != format_string.end() && '$' == *i) {
 854             formatted.push_back('$');
 855             ++i;
 856           }
 857           --i;
 858         } else {
 859           uintptr_t index = *i - '1';
 860           if (offsets) {
 861             ReplacementOffset r_offset(index,
 862                 static_cast<int>(formatted.size()));
 863             r_offsets.insert(std::lower_bound(r_offsets.begin(),
 864                                               r_offsets.end(),
 865                                               r_offset,
 866                                               &CompareParameter),
 867                              r_offset);
 868           }
 869           if (index < substitutions)
 870             formatted.append(subst.at(index));
 871         }
 872       }
 873     } else {
 874       formatted.push_back(*i);
 875     }
 876   }
 877   if (offsets) {
 878     for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin();
 879          i != r_offsets.end(); ++i) {
 880       offsets->push_back(i->offset);
 881     }
 882   }
 883   return formatted;
 884 }
 885
// Public ReplaceStringPlaceholders overloads taking a vector of
// substitutions. Both forward to DoReplaceStringPlaceholders, which expands
// "$N" placeholders in |format_string| from |subst| and, when |offsets| is
// non-null, appends the output offsets of the replaced placeholders to it.
string16 ReplaceStringPlaceholders(const string16& format_string,
                                   const std::vector<string16>& subst,
                                   std::vector<size_t>* offsets) {
  return DoReplaceStringPlaceholders(format_string, subst, offsets);
}

std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
                                      const std::vector<std::string>& subst,
                                      std::vector<size_t>* offsets) {
  return DoReplaceStringPlaceholders(format_string, subst, offsets);
}
 897
 898 string16 ReplaceStringPlaceholders(const string16& format_string,
 899                                    const string16& a,
 900                                    size_t* offset) {
 901   std::vector<size_t> offsets;
 902   std::vector<string16> subst;
 903   subst.push_back(a);
 904   string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets);
 905
 906   DCHECK(offsets.size() == 1);
 907   if (offset) {
 908     *offset = offsets[0];
 909   }
 910   return result;
 911 }
 912
 913 static bool IsWildcard(base_icu::UChar32 character) {
 914   return character == '*' || character == '?';
 915 }
 916
 917 // Move the strings pointers to the point where they start to differ.
 918 template <typename CHAR, typename NEXT>
 919 static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
 920                          const CHAR** string, const CHAR* string_end,
 921                          NEXT next) {
 922   const CHAR* escape = NULL;
 923   while (*pattern != pattern_end && *string != string_end) {
 924     if (!escape && IsWildcard(**pattern)) {
 925       // We don't want to match wildcard here, except if it's escaped.
 926       return;
 927     }
 928
 929     // Check if the escapement char is found. If so, skip it and move to the
 930     // next character.
 931     if (!escape && **pattern == '\\') {
 932       escape = *pattern;
 933       next(pattern, pattern_end);
 934       continue;
 935     }
 936
 937     // Check if the chars match, if so, increment the ptrs.
 938     const CHAR* pattern_next = *pattern;
 939     const CHAR* string_next = *string;
 940     base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
 941     if (pattern_char == next(&string_next, string_end) &&
 942         pattern_char != (base_icu::UChar32) CBU_SENTINEL) {
 943       *pattern = pattern_next;
 944       *string = string_next;
 945     } else {
 946       // Uh ho, it did not match, we are done. If the last char was an
 947       // escapement, that means that it was an error to advance the ptr here,
 948       // let's put it back where it was. This also mean that the MatchPattern
 949       // function will return false because if we can't match an escape char
 950       // here, then no one will.
 951       if (escape) {
 952         *pattern = escape;
 953       }
 954       return;
 955     }
 956
 957     escape = NULL;
 958   }
 959 }
 960
 961 template <typename CHAR, typename NEXT>
 962 static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
 963   while (*pattern != end) {
 964     if (!IsWildcard(**pattern))
 965       return;
 966     next(pattern, end);
 967   }
 968 }
 969
 970 template <typename CHAR, typename NEXT>
 971 static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
 972                           const CHAR* pattern, const CHAR* pattern_end,
 973                           int depth,
 974                           NEXT next) {
 975   const int kMaxDepth = 16;
 976   if (depth > kMaxDepth)
 977     return false;
 978
 979   // Eat all the matching chars.
 980   EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
 981
 982   // If the string is empty, then the pattern must be empty too, or contains
 983   // only wildcards.
 984   if (eval == eval_end) {
 985     EatWildcard(&pattern, pattern_end, next);
 986     return pattern == pattern_end;
 987   }
 988
 989   // Pattern is empty but not string, this is not a match.
 990   if (pattern == pattern_end)
 991     return false;
 992
 993   // If this is a question mark, then we need to compare the rest with
 994   // the current string or the string with one character eaten.
 995   const CHAR* next_pattern = pattern;
 996   next(&next_pattern, pattern_end);
 997   if (pattern[0] == '?') {
 998     if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
 999                       depth + 1, next))
1000       return true;
1001     const CHAR* next_eval = eval;
1002     next(&next_eval, eval_end);
1003     if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
1004                       depth + 1, next))
1005       return true;
1006   }
1007
1008   // This is a *, try to match all the possible substrings with the remainder
1009   // of the pattern.
1010   if (pattern[0] == '*') {
1011     // Collapse duplicate wild cards (********** into *) so that the
1012     // method does not recurse unnecessarily. http://crbug.com/52839
1013     EatWildcard(&next_pattern, pattern_end, next);
1014
1015     while (eval != eval_end) {
1016       if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
1017                         depth + 1, next))
1018         return true;
1019       eval++;
1020     }
1021
1022     // We reached the end of the string, let see if the pattern contains only
1023     // wildcards.
1024     if (eval == eval_end) {
1025       EatWildcard(&pattern, pattern_end, next);
1026       if (pattern != pattern_end)
1027         return false;
1028       return true;
1029     }
1030   }
1031
1032   return false;
1033 }
1034
1035 struct NextCharUTF8 {
1036   base_icu::UChar32 operator()(const char** p, const char* end) {
1037     base_icu::UChar32 c;
1038     int offset = 0;
1039     CBU8_NEXT(*p, offset, end - *p, c);
1040     *p += offset;
1041     return c;
1042   }
1043 };
1044
1045 struct NextCharUTF16 {
1046   base_icu::UChar32 operator()(const char16** p, const char16* end) {
1047     base_icu::UChar32 c;
1048     int offset = 0;
1049     CBU16_NEXT(*p, offset, end - *p, c);
1050     *p += offset;
1051     return c;
1052   }
1053 };
1054
1055 bool MatchPattern(const base::StringPiece& eval,
1056                   const base::StringPiece& pattern) {
1057   return MatchPatternT(eval.data(), eval.data() + eval.size(),
1058                        pattern.data(), pattern.data() + pattern.size(),
1059                        0, NextCharUTF8());
1060 }
1061
1062 bool MatchPattern(const string16& eval, const string16& pattern) {
1063   return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
1064                        pattern.c_str(), pattern.c_str() + pattern.size(),
1065                        0, NextCharUTF16());
1066 }
1067
1068 // The following code is compatible with the OpenBSD lcpy interface.  See:
1069 //   http://www.gratisoft.us/todd/papers/strlcpy.html
1070 //   ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1071
namespace {

// Shared implementation of the strlcpy-style copy for any character type.
//
// Copies |src| into |dst|, writing at most |dst_size| characters including the
// terminator.  Whenever |dst_size| is non-zero the result in |dst| is
// terminated, truncating |src| if it does not fit.  Returns the length of
// |src| in characters (terminator excluded), so a return value greater than
// or equal to |dst_size| tells the caller that truncation happened.
template <typename CHAR>
size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
  size_t pos = 0;
  while (pos < dst_size) {
    const CHAR ch = src[pos];
    dst[pos] = ch;
    if (ch == 0)  // The terminator was reached and copied; everything fit.
      return pos;
    ++pos;
  }

  // |src| did not fit: every slot of |dst| now holds a source character, so
  // the last one is sacrificed for the terminator.
  if (dst_size > 0)
    dst[dst_size - 1] = 0;

  // Walk the part of |src| that was not copied to find its full length.
  const CHAR* tail = src + pos;
  while (*tail != 0)
    ++tail;
  return static_cast<size_t>(tail - src);
}

}  // namespace
1091
1092 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {
1093   return lcpyT<char>(dst, src, dst_size);
1094 }
1095 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
1096   return lcpyT<wchar_t>(dst, src, dst_size);
1097 }