base/string_util.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "base/string_util.h"
   6
   7 #include "build/build_config.h"
   8
   9 #include <ctype.h>
  10 #include <errno.h>
  11 #include <math.h>
  12 #include <stdarg.h>
  13 #include <stdio.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16 #include <time.h>
  17 #include <wchar.h>
  18 #include <wctype.h>
  19
  20 #include <algorithm>
  21 #include <vector>
  22
  23 #include "base/basictypes.h"
  24 #include "base/logging.h"
  25 #include "base/memory/singleton.h"
  26 #include "base/third_party/dmg_fp/dmg_fp.h"
  27 #include "base/utf_string_conversion_utils.h"
  28 #include "base/utf_string_conversions.h"
  29 #include "base/third_party/icu/icu_utf.h"
  30
  31 namespace {
  32
  33 // Force the singleton used by Empty[W]String[16] to be a unique type. This
  34 // prevents other code that might accidentally use Singleton<string> from
  35 // getting our internal one.
struct EmptyStrings {
  EmptyStrings() {}
  // Default-constructed (and therefore empty) strings, one per string type.
  // They are handed out by reference from EmptyString(), EmptyWString() and
  // EmptyString16() respectively.
  const std::string s;
  const std::wstring ws;
  const string16 s16;

  // Returns the instance managed by Singleton<EmptyStrings>.
  static EmptyStrings* GetInstance() {
    return Singleton<EmptyStrings>::get();
  }
};
  46
  47 // Used by ReplaceStringPlaceholders to track the position in the string of
  48 // replaced parameters.
  49 struct ReplacementOffset {
  50   ReplacementOffset(uintptr_t parameter, size_t offset)
  51       : parameter(parameter),
  52         offset(offset) {}
  53
  54   // Index of the parameter.
  55   uintptr_t parameter;
  56
  57   // Starting position in the string.
  58   size_t offset;
  59 };
  60
  61 static bool CompareParameter(const ReplacementOffset& elem1,
  62                              const ReplacementOffset& elem2) {
  63   return elem1.parameter < elem2.parameter;
  64 }
  65
  66 }  // namespace
  67
  68 namespace base {
  69
  70 bool IsWprintfFormatPortable(const wchar_t* format) {
  71   for (const wchar_t* position = format; *position != '\0'; ++position) {
  72     if (*position == '%') {
  73       bool in_specification = true;
  74       bool modifier_l = false;
  75       while (in_specification) {
  76         // Eat up characters until reaching a known specifier.
  77         if (*++position == '\0') {
  78           // The format string ended in the middle of a specification.  Call
  79           // it portable because no unportable specifications were found.  The
  80           // string is equally broken on all platforms.
  81           return true;
  82         }
  83
  84         if (*position == 'l') {
  85           // 'l' is the only thing that can save the 's' and 'c' specifiers.
  86           modifier_l = true;
  87         } else if (((*position == 's' || *position == 'c') && !modifier_l) ||
  88                    *position == 'S' || *position == 'C' || *position == 'F' ||
  89                    *position == 'D' || *position == 'O' || *position == 'U') {
  90           // Not portable.
  91           return false;
  92         }
  93
  94         if (wcschr(L"diouxXeEfgGaAcspn%", *position)) {
  95           // Portable, keep scanning the rest of the format string.
  96           in_specification = false;
  97         }
  98       }
  99     }
 100   }
 101
 102   return true;
 103 }
 104
 105 }  // namespace base
 106
 107
 108 const std::string& EmptyString() {
 109   return EmptyStrings::GetInstance()->s;
 110 }
 111
 112 const std::wstring& EmptyWString() {
 113   return EmptyStrings::GetInstance()->ws;
 114 }
 115
 116 const string16& EmptyString16() {
 117   return EmptyStrings::GetInstance()->s16;
 118 }
 119
// Comma-separated initializer list of the code points this file treats as
// whitespace, ending with a 0 terminator. It is expanded into the initializers
// of both kWhitespaceWide and kWhitespaceUTF16 so the two tables stay in sync.
#define WHITESPACE_UNICODE \
  0x0009, /* <control-0009> to <control-000D> */ \
  0x000A,                                        \
  0x000B,                                        \
  0x000C,                                        \
  0x000D,                                        \
  0x0020, /* Space */                            \
  0x0085, /* <control-0085> */                   \
  0x00A0, /* No-Break Space */                   \
  0x1680, /* Ogham Space Mark */                 \
  0x180E, /* Mongolian Vowel Separator */        \
  0x2000, /* En Quad to Hair Space */            \
  0x2001,                                        \
  0x2002,                                        \
  0x2003,                                        \
  0x2004,                                        \
  0x2005,                                        \
  0x2006,                                        \
  0x2007,                                        \
  0x2008,                                        \
  0x2009,                                        \
  0x200A,                                        \
  0x200C, /* Zero Width Non-Joiner */            \
  0x2028, /* Line Separator */                   \
  0x2029, /* Paragraph Separator */              \
  0x202F, /* Narrow No-Break Space */            \
  0x205F, /* Medium Mathematical Space */        \
  0x3000, /* Ideographic Space */                \
  0
 149
// 0-terminated whitespace table in wchar_t units, built from
// WHITESPACE_UNICODE.
const wchar_t kWhitespaceWide[] = {
  WHITESPACE_UNICODE
};
// The same table in char16 units; passed to TrimStringT by the string16
// TrimWhitespace overload.
const char16 kWhitespaceUTF16[] = {
  WHITESPACE_UNICODE
};
 156 const char kWhitespaceASCII[] = {
 157   0x09,    // <control-0009> to <control-000D>
 158   0x0A,
 159   0x0B,
 160   0x0C,
 161   0x0D,
 162   0x20,    // Space
 163   0
 164 };
 165
// The three bytes EF BB BF (the UTF-8 encoding of U+FEFF), followed by the
// string literal's implicit NUL terminator.
const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF";
 167
 168 template<typename STR>
 169 bool RemoveCharsT(const STR& input,
 170                   const typename STR::value_type remove_chars[],
 171                   STR* output) {
 172   bool removed = false;
 173   size_t found;
 174
 175   *output = input;
 176
 177   found = output->find_first_of(remove_chars);
 178   while (found != STR::npos) {
 179     removed = true;
 180     output->replace(found, 1, STR());
 181     found = output->find_first_of(remove_chars, found);
 182   }
 183
 184   return removed;
 185 }
 186
 187 bool RemoveChars(const string16& input,
 188                  const char16 remove_chars[],
 189                  string16* output) {
 190   return RemoveCharsT(input, remove_chars, output);
 191 }
 192
 193 bool RemoveChars(const std::string& input,
 194                  const char remove_chars[],
 195                  std::string* output) {
 196   return RemoveCharsT(input, remove_chars, output);
 197 }
 198
 199 template<typename STR>
 200 TrimPositions TrimStringT(const STR& input,
 201                           const typename STR::value_type trim_chars[],
 202                           TrimPositions positions,
 203                           STR* output) {
 204   // Find the edges of leading/trailing whitespace as desired.
 205   const typename STR::size_type last_char = input.length() - 1;
 206   const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ?
 207       input.find_first_not_of(trim_chars) : 0;
 208   const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ?
 209       input.find_last_not_of(trim_chars) : last_char;
 210
 211   // When the string was all whitespace, report that we stripped off whitespace
 212   // from whichever position the caller was interested in.  For empty input, we
 213   // stripped no whitespace, but we still need to clear |output|.
 214   if (input.empty() ||
 215       (first_good_char == STR::npos) || (last_good_char == STR::npos)) {
 216     bool input_was_empty = input.empty();  // in case output == &input
 217     output->clear();
 218     return input_was_empty ? TRIM_NONE : positions;
 219   }
 220
 221   // Trim the whitespace.
 222   *output =
 223       input.substr(first_good_char, last_good_char - first_good_char + 1);
 224
 225   // Return where we trimmed from.
 226   return static_cast<TrimPositions>(
 227       ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
 228       ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
 229 }
 230
 231 bool TrimString(const std::wstring& input,
 232                 const wchar_t trim_chars[],
 233                 std::wstring* output) {
 234   return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
 235 }
 236
 237 #if !defined(WCHAR_T_IS_UTF16)
 238 bool TrimString(const string16& input,
 239                 const char16 trim_chars[],
 240                 string16* output) {
 241   return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
 242 }
 243 #endif
 244
 245 bool TrimString(const std::string& input,
 246                 const char trim_chars[],
 247                 std::string* output) {
 248   return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
 249 }
 250
 251 void TruncateUTF8ToByteSize(const std::string& input,
 252                             const size_t byte_size,
 253                             std::string* output) {
 254   DCHECK(output);
 255   if (byte_size > input.length()) {
 256     *output = input;
 257     return;
 258   }
 259   DCHECK_LE(byte_size, static_cast<uint32>(kint32max));
 260   // Note: This cast is necessary because CBU8_NEXT uses int32s.
 261   int32 truncation_length = static_cast<int32>(byte_size);
 262   int32 char_index = truncation_length - 1;
 263   const char* data = input.data();
 264
 265   // Using CBU8, we will move backwards from the truncation point
 266   // to the beginning of the string looking for a valid UTF8
 267   // character.  Once a full UTF8 character is found, we will
 268   // truncate the string to the end of that character.
 269   while (char_index >= 0) {
 270     int32 prev = char_index;
 271     uint32 code_point = 0;
 272     CBU8_NEXT(data, char_index, truncation_length, code_point);
 273     if (!base::IsValidCharacter(code_point) ||
 274         !base::IsValidCodepoint(code_point)) {
 275       char_index = prev - 1;
 276     } else {
 277       break;
 278     }
 279   }
 280
 281   if (char_index >= 0 )
 282     *output = input.substr(0, char_index);
 283   else
 284     output->clear();
 285 }
 286
 287 TrimPositions TrimWhitespace(const string16& input,
 288                              TrimPositions positions,
 289                              string16* output) {
 290   return TrimStringT(input, kWhitespaceUTF16, positions, output);
 291 }
 292
 293 TrimPositions TrimWhitespaceASCII(const std::string& input,
 294                                   TrimPositions positions,
 295                                   std::string* output) {
 296   return TrimStringT(input, kWhitespaceASCII, positions, output);
 297 }
 298
 299 // This function is only for backward-compatibility.
 300 // To be removed when all callers are updated.
 301 TrimPositions TrimWhitespace(const std::string& input,
 302                              TrimPositions positions,
 303                              std::string* output) {
 304   return TrimWhitespaceASCII(input, positions, output);
 305 }
 306
 307 template<typename STR>
 308 STR CollapseWhitespaceT(const STR& text,
 309                         bool trim_sequences_with_line_breaks) {
 310   STR result;
 311   result.resize(text.size());
 312
 313   // Set flags to pretend we're already in a trimmed whitespace sequence, so we
 314   // will trim any leading whitespace.
 315   bool in_whitespace = true;
 316   bool already_trimmed = true;
 317
 318   int chars_written = 0;
 319   for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
 320     if (IsWhitespace(*i)) {
 321       if (!in_whitespace) {
 322         // Reduce all whitespace sequences to a single space.
 323         in_whitespace = true;
 324         result[chars_written++] = L' ';
 325       }
 326       if (trim_sequences_with_line_breaks && !already_trimmed &&
 327           ((*i == '\n') || (*i == '\r'))) {
 328         // Whitespace sequences containing CR or LF are eliminated entirely.
 329         already_trimmed = true;
 330         --chars_written;
 331       }
 332     } else {
 333       // Non-whitespace chracters are copied straight across.
 334       in_whitespace = false;
 335       already_trimmed = false;
 336       result[chars_written++] = *i;
 337     }
 338   }
 339
 340   if (in_whitespace && !already_trimmed) {
 341     // Any trailing whitespace is eliminated.
 342     --chars_written;
 343   }
 344
 345   result.resize(chars_written);
 346   return result;
 347 }
 348
 349 std::wstring CollapseWhitespace(const std::wstring& text,
 350                                 bool trim_sequences_with_line_breaks) {
 351   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
 352 }
 353
 354 #if !defined(WCHAR_T_IS_UTF16)
 355 string16 CollapseWhitespace(const string16& text,
 356                             bool trim_sequences_with_line_breaks) {
 357   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
 358 }
 359 #endif
 360
 361 std::string CollapseWhitespaceASCII(const std::string& text,
 362                                     bool trim_sequences_with_line_breaks) {
 363   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
 364 }
 365
 366 bool ContainsOnlyWhitespaceASCII(const std::string& str) {
 367   for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) {
 368     if (!IsAsciiWhitespace(*i))
 369       return false;
 370   }
 371   return true;
 372 }
 373
 374 bool ContainsOnlyWhitespace(const string16& str) {
 375   for (string16::const_iterator i(str.begin()); i != str.end(); ++i) {
 376     if (!IsWhitespace(*i))
 377       return false;
 378   }
 379   return true;
 380 }
 381
 382 template<typename STR>
 383 static bool ContainsOnlyCharsT(const STR& input, const STR& characters) {
 384   for (typename STR::const_iterator iter = input.begin();
 385        iter != input.end(); ++iter) {
 386     if (characters.find(*iter) == STR::npos)
 387       return false;
 388   }
 389   return true;
 390 }
 391
 392 bool ContainsOnlyChars(const std::wstring& input,
 393                        const std::wstring& characters) {
 394   return ContainsOnlyCharsT(input, characters);
 395 }
 396
 397 #if !defined(WCHAR_T_IS_UTF16)
 398 bool ContainsOnlyChars(const string16& input, const string16& characters) {
 399   return ContainsOnlyCharsT(input, characters);
 400 }
 401 #endif
 402
 403 bool ContainsOnlyChars(const std::string& input,
 404                        const std::string& characters) {
 405   return ContainsOnlyCharsT(input, characters);
 406 }
 407
 408 std::string WideToASCII(const std::wstring& wide) {
 409   DCHECK(IsStringASCII(wide)) << wide;
 410   return std::string(wide.begin(), wide.end());
 411 }
 412
 413 std::string UTF16ToASCII(const string16& utf16) {
 414   DCHECK(IsStringASCII(utf16)) << utf16;
 415   return std::string(utf16.begin(), utf16.end());
 416 }
 417
 418 // Latin1 is just the low range of Unicode, so we can copy directly to convert.
 419 bool WideToLatin1(const std::wstring& wide, std::string* latin1) {
 420   std::string output;
 421   output.resize(wide.size());
 422   latin1->clear();
 423   for (size_t i = 0; i < wide.size(); i++) {
 424     if (wide[i] > 255)
 425       return false;
 426     output[i] = static_cast<char>(wide[i]);
 427   }
 428   latin1->swap(output);
 429   return true;
 430 }
 431
 432 template<class STR>
 433 static bool DoIsStringASCII(const STR& str) {
 434   for (size_t i = 0; i < str.length(); i++) {
 435     typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i];
 436     if (c > 0x7F)
 437       return false;
 438   }
 439   return true;
 440 }
 441
 442 bool IsStringASCII(const std::wstring& str) {
 443   return DoIsStringASCII(str);
 444 }
 445
 446 #if !defined(WCHAR_T_IS_UTF16)
 447 bool IsStringASCII(const string16& str) {
 448   return DoIsStringASCII(str);
 449 }
 450 #endif
 451
 452 bool IsStringASCII(const base::StringPiece& str) {
 453   return DoIsStringASCII(str);
 454 }
 455
 456 bool IsStringUTF8(const std::string& str) {
 457   const char *src = str.data();
 458   int32 src_len = static_cast<int32>(str.length());
 459   int32 char_index = 0;
 460
 461   while (char_index < src_len) {
 462     int32 code_point;
 463     CBU8_NEXT(src, char_index, src_len, code_point);
 464     if (!base::IsValidCharacter(code_point))
 465        return false;
 466   }
 467   return true;
 468 }
 469
 470 template<typename Iter>
 471 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
 472                                           Iter a_end,
 473                                           const char* b) {
 474   for (Iter it = a_begin; it != a_end; ++it, ++b) {
 475     if (!*b || base::ToLowerASCII(*it) != *b)
 476       return false;
 477   }
 478   return *b == 0;
 479 }
 480
 481 // Front-ends for LowerCaseEqualsASCII.
 482 bool LowerCaseEqualsASCII(const std::string& a, const char* b) {
 483   return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
 484 }
 485
 486 bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) {
 487   return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
 488 }
 489
 490 #if !defined(WCHAR_T_IS_UTF16)
 491 bool LowerCaseEqualsASCII(const string16& a, const char* b) {
 492   return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
 493 }
 494 #endif
 495
 496 bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
 497                           std::string::const_iterator a_end,
 498                           const char* b) {
 499   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 500 }
 501
 502 bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
 503                           std::wstring::const_iterator a_end,
 504                           const char* b) {
 505   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 506 }
 507
 508 #if !defined(WCHAR_T_IS_UTF16)
 509 bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
 510                           string16::const_iterator a_end,
 511                           const char* b) {
 512   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 513 }
 514 #endif
 515
 516 // TODO(port): Resolve wchar_t/iterator issues that require OS_ANDROID here.
 517 #if !defined(OS_ANDROID)
 518 bool LowerCaseEqualsASCII(const char* a_begin,
 519                           const char* a_end,
 520                           const char* b) {
 521   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 522 }
 523
 524 bool LowerCaseEqualsASCII(const wchar_t* a_begin,
 525                           const wchar_t* a_end,
 526                           const char* b) {
 527   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 528 }
 529
 530 #if !defined(WCHAR_T_IS_UTF16)
 531 bool LowerCaseEqualsASCII(const char16* a_begin,
 532                           const char16* a_end,
 533                           const char* b) {
 534   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 535 }
 536 #endif
 537
 538 #endif  // !defined(OS_ANDROID)
 539
 540 bool EqualsASCII(const string16& a, const base::StringPiece& b) {
 541   if (a.length() != b.length())
 542     return false;
 543   return std::equal(b.begin(), b.end(), a.begin());
 544 }
 545
 546 bool StartsWithASCII(const std::string& str,
 547                      const std::string& search,
 548                      bool case_sensitive) {
 549   if (case_sensitive)
 550     return str.compare(0, search.length(), search) == 0;
 551   else
 552     return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0;
 553 }
 554
 555 template <typename STR>
 556 bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) {
 557   if (case_sensitive) {
 558     return str.compare(0, search.length(), search) == 0;
 559   } else {
 560     if (search.size() > str.size())
 561       return false;
 562     return std::equal(search.begin(), search.end(), str.begin(),
 563                       base::CaseInsensitiveCompare<typename STR::value_type>());
 564   }
 565 }
 566
 567 bool StartsWith(const std::wstring& str, const std::wstring& search,
 568                 bool case_sensitive) {
 569   return StartsWithT(str, search, case_sensitive);
 570 }
 571
 572 #if !defined(WCHAR_T_IS_UTF16)
 573 bool StartsWith(const string16& str, const string16& search,
 574                 bool case_sensitive) {
 575   return StartsWithT(str, search, case_sensitive);
 576 }
 577 #endif
 578
 579 template <typename STR>
 580 bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) {
 581   typename STR::size_type str_length = str.length();
 582   typename STR::size_type search_length = search.length();
 583   if (search_length > str_length)
 584     return false;
 585   if (case_sensitive) {
 586     return str.compare(str_length - search_length, search_length, search) == 0;
 587   } else {
 588     return std::equal(search.begin(), search.end(),
 589                       str.begin() + (str_length - search_length),
 590                       base::CaseInsensitiveCompare<typename STR::value_type>());
 591   }
 592 }
 593
 594 bool EndsWith(const std::string& str, const std::string& search,
 595               bool case_sensitive) {
 596   return EndsWithT(str, search, case_sensitive);
 597 }
 598
 599 bool EndsWith(const std::wstring& str, const std::wstring& search,
 600               bool case_sensitive) {
 601   return EndsWithT(str, search, case_sensitive);
 602 }
 603
 604 #if !defined(WCHAR_T_IS_UTF16)
 605 bool EndsWith(const string16& str, const string16& search,
 606               bool case_sensitive) {
 607   return EndsWithT(str, search, case_sensitive);
 608 }
 609 #endif
 610
// Unit suffixes used by FormatBytesUnlocalized, indexed by how many times the
// byte count was divided by 1024. Each suffix carries its own leading space.
static const char* const kByteStringsUnlocalized[] = {
  " B",
  " kB",
  " MB",
  " GB",
  " TB",
  " PB"
};
 619
 620 string16 FormatBytesUnlocalized(int64 bytes) {
 621   double unit_amount = static_cast<double>(bytes);
 622   size_t dimension = 0;
 623   const int kKilo = 1024;
 624   while (unit_amount >= kKilo &&
 625          dimension < arraysize(kByteStringsUnlocalized) - 1) {
 626     unit_amount /= kKilo;
 627     dimension++;
 628   }
 629
 630   char buf[64];
 631   if (bytes != 0 && dimension > 0 && unit_amount < 100) {
 632     base::snprintf(buf, arraysize(buf), "%.1lf%s", unit_amount,
 633                    kByteStringsUnlocalized[dimension]);
 634   } else {
 635     base::snprintf(buf, arraysize(buf), "%.0lf%s", unit_amount,
 636                    kByteStringsUnlocalized[dimension]);
 637   }
 638
 639   return ASCIIToUTF16(buf);
 640 }
 641
 642 template<class StringType>
 643 void DoReplaceSubstringsAfterOffset(StringType* str,
 644                                     typename StringType::size_type start_offset,
 645                                     const StringType& find_this,
 646                                     const StringType& replace_with,
 647                                     bool replace_all) {
 648   if ((start_offset == StringType::npos) || (start_offset >= str->length()))
 649     return;
 650
 651   DCHECK(!find_this.empty());
 652   for (typename StringType::size_type offs(str->find(find_this, start_offset));
 653       offs != StringType::npos; offs = str->find(find_this, offs)) {
 654     str->replace(offs, find_this.length(), replace_with);
 655     offs += replace_with.length();
 656
 657     if (!replace_all)
 658       break;
 659   }
 660 }
 661
 662 void ReplaceFirstSubstringAfterOffset(string16* str,
 663                                       string16::size_type start_offset,
 664                                       const string16& find_this,
 665                                       const string16& replace_with) {
 666   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 667                                  false);  // replace first instance
 668 }
 669
 670 void ReplaceFirstSubstringAfterOffset(std::string* str,
 671                                       std::string::size_type start_offset,
 672                                       const std::string& find_this,
 673                                       const std::string& replace_with) {
 674   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 675                                  false);  // replace first instance
 676 }
 677
 678 void ReplaceSubstringsAfterOffset(string16* str,
 679                                   string16::size_type start_offset,
 680                                   const string16& find_this,
 681                                   const string16& replace_with) {
 682   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 683                                  true);  // replace all instances
 684 }
 685
 686 void ReplaceSubstringsAfterOffset(std::string* str,
 687                                   std::string::size_type start_offset,
 688                                   const std::string& find_this,
 689                                   const std::string& replace_with) {
 690   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 691                                  true);  // replace all instances
 692 }
 693
 694
 695 template<typename STR>
 696 static size_t TokenizeT(const STR& str,
 697                         const STR& delimiters,
 698                         std::vector<STR>* tokens) {
 699   tokens->clear();
 700
 701   typename STR::size_type start = str.find_first_not_of(delimiters);
 702   while (start != STR::npos) {
 703     typename STR::size_type end = str.find_first_of(delimiters, start + 1);
 704     if (end == STR::npos) {
 705       tokens->push_back(str.substr(start));
 706       break;
 707     } else {
 708       tokens->push_back(str.substr(start, end - start));
 709       start = str.find_first_not_of(delimiters, end + 1);
 710     }
 711   }
 712
 713   return tokens->size();
 714 }
 715
 716 size_t Tokenize(const std::wstring& str,
 717                 const std::wstring& delimiters,
 718                 std::vector<std::wstring>* tokens) {
 719   return TokenizeT(str, delimiters, tokens);
 720 }
 721
 722 #if !defined(WCHAR_T_IS_UTF16)
 723 size_t Tokenize(const string16& str,
 724                 const string16& delimiters,
 725                 std::vector<string16>* tokens) {
 726   return TokenizeT(str, delimiters, tokens);
 727 }
 728 #endif
 729
 730 size_t Tokenize(const std::string& str,
 731                 const std::string& delimiters,
 732                 std::vector<std::string>* tokens) {
 733   return TokenizeT(str, delimiters, tokens);
 734 }
 735
 736 size_t Tokenize(const base::StringPiece& str,
 737                 const base::StringPiece& delimiters,
 738                 std::vector<base::StringPiece>* tokens) {
 739   return TokenizeT(str, delimiters, tokens);
 740 }
 741
 742 template<typename STR>
 743 static STR JoinStringT(const std::vector<STR>& parts,
 744                        typename STR::value_type sep) {
 745   if (parts.empty())
 746     return STR();
 747
 748   STR result(parts[0]);
 749   typename std::vector<STR>::const_iterator iter = parts.begin();
 750   ++iter;
 751
 752   for (; iter != parts.end(); ++iter) {
 753     result += sep;
 754     result += *iter;
 755   }
 756
 757   return result;
 758 }
 759
 760 std::string JoinString(const std::vector<std::string>& parts, char sep) {
 761   return JoinStringT(parts, sep);
 762 }
 763
 764 string16 JoinString(const std::vector<string16>& parts, char16 sep) {
 765   return JoinStringT(parts, sep);
 766 }
 767
 768 template<class FormatStringType, class OutStringType>
 769 OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string,
 770     const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) {
 771   size_t substitutions = subst.size();
 772
 773   size_t sub_length = 0;
 774   for (typename std::vector<OutStringType>::const_iterator iter = subst.begin();
 775        iter != subst.end(); ++iter) {
 776     sub_length += iter->length();
 777   }
 778
 779   OutStringType formatted;
 780   formatted.reserve(format_string.length() + sub_length);
 781
 782   std::vector<ReplacementOffset> r_offsets;
 783   for (typename FormatStringType::const_iterator i = format_string.begin();
 784        i != format_string.end(); ++i) {
 785     if ('$' == *i) {
 786       if (i + 1 != format_string.end()) {
 787         ++i;
 788         DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i;
 789         if ('$' == *i) {
 790           while (i != format_string.end() && '$' == *i) {
 791             formatted.push_back('$');
 792             ++i;
 793           }
 794           --i;
 795         } else {
 796           uintptr_t index = 0;
 797           while (i != format_string.end() && '0' <= *i && *i <= '9') {
 798             index *= 10;
 799             index += *i - '0';
 800             ++i;
 801           }
 802           --i;
 803           index -= 1;
 804           if (offsets) {
 805             ReplacementOffset r_offset(index,
 806                 static_cast<int>(formatted.size()));
 807             r_offsets.insert(std::lower_bound(r_offsets.begin(),
 808                                               r_offsets.end(),
 809                                               r_offset,
 810                                               &CompareParameter),
 811                              r_offset);
 812           }
 813           if (index < substitutions)
 814             formatted.append(subst.at(index));
 815         }
 816       }
 817     } else {
 818       formatted.push_back(*i);
 819     }
 820   }
 821   if (offsets) {
 822     for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin();
 823          i != r_offsets.end(); ++i) {
 824       offsets->push_back(i->offset);
 825     }
 826   }
 827   return formatted;
 828 }
 829
 830 string16 ReplaceStringPlaceholders(const string16& format_string,
 831                                    const std::vector<string16>& subst,
 832                                    std::vector<size_t>* offsets) {
 833   return DoReplaceStringPlaceholders(format_string, subst, offsets);
 834 }
 835
 836 std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
 837                                       const std::vector<std::string>& subst,
 838                                       std::vector<size_t>* offsets) {
 839   return DoReplaceStringPlaceholders(format_string, subst, offsets);
 840 }
 841
 842 string16 ReplaceStringPlaceholders(const string16& format_string,
 843                                    const string16& a,
 844                                    size_t* offset) {
 845   std::vector<size_t> offsets;
 846   std::vector<string16> subst;
 847   subst.push_back(a);
 848   string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets);
 849
 850   DCHECK(offsets.size() == 1);
 851   if (offset) {
 852     *offset = offsets[0];
 853   }
 854   return result;
 855 }
 856
 857 static bool IsWildcard(base_icu::UChar32 character) {
 858   return character == '*' || character == '?';
 859 }
 860
 861 // Move the strings pointers to the point where they start to differ.
 862 template <typename CHAR, typename NEXT>
 863 static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
 864                          const CHAR** string, const CHAR* string_end,
 865                          NEXT next) {
 866   const CHAR* escape = NULL;
 867   while (*pattern != pattern_end && *string != string_end) {
 868     if (!escape && IsWildcard(**pattern)) {
 869       // We don't want to match wildcard here, except if it's escaped.
 870       return;
 871     }
 872
 873     // Check if the escapement char is found. If so, skip it and move to the
 874     // next character.
 875     if (!escape && **pattern == '\\') {
 876       escape = *pattern;
 877       next(pattern, pattern_end);
 878       continue;
 879     }
 880
 881     // Check if the chars match, if so, increment the ptrs.
 882     const CHAR* pattern_next = *pattern;
 883     const CHAR* string_next = *string;
 884     base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
 885     if (pattern_char == next(&string_next, string_end) &&
 886         pattern_char != (base_icu::UChar32) CBU_SENTINEL) {
 887       *pattern = pattern_next;
 888       *string = string_next;
 889     } else {
 890       // Uh ho, it did not match, we are done. If the last char was an
 891       // escapement, that means that it was an error to advance the ptr here,
 892       // let's put it back where it was. This also mean that the MatchPattern
 893       // function will return false because if we can't match an escape char
 894       // here, then no one will.
 895       if (escape) {
 896         *pattern = escape;
 897       }
 898       return;
 899     }
 900
 901     escape = NULL;
 902   }
 903 }
 904
 // Moves |*pattern| forward past a run of consecutive wildcard characters
 // ('*' or '?'). Stops at the first non-wildcard character or at |end|,
 // whichever comes first. |next| advances the cursor by one character.
 template <typename CHAR, typename NEXT>
 static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
   while (*pattern != end && IsWildcard(**pattern))
     next(pattern, end);
 }
 913
 // Recursive wildcard matcher shared by the UTF-8 and UTF-16 MatchPattern()
 // entry points.
 //
 //   [eval, eval_end)        the string being tested.
 //   [pattern, pattern_end)  the pattern. '?' matches zero or one character,
 //                           '*' matches any run of characters (including
 //                           none), and a backslash escapes the following
 //                           character (handled by EatSameChars).
 //   depth                   current recursion depth; callers pass 0.
 //   next                    functor that decodes one character at a cursor
 //                           and advances the cursor past it.
 //
 // Returns true if the whole string matches the whole pattern. Recursion is
 // capped at kMaxDepth levels; anything that needs to go deeper is reported as
 // a non-match.
 template <typename CHAR, typename NEXT>
 static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
                           const CHAR* pattern, const CHAR* pattern_end,
                           int depth,
                           NEXT next) {
   const int kMaxDepth = 16;
   if (depth > kMaxDepth)
     return false;

   // Eat all the matching chars.
   EatSameChars(&pattern, pattern_end, &eval, eval_end, next);

   // If the string is empty, then the pattern must be empty too, or contain
   // only wildcards.
   if (eval == eval_end) {
     EatWildcard(&pattern, pattern_end, next);
     return pattern == pattern_end;
   }

   // Pattern is empty but not string, this is not a match.
   if (pattern == pattern_end)
     return false;

   // |next_pattern| is the pattern with its first character removed.
   const CHAR* next_pattern = pattern;
   next(&next_pattern, pattern_end);

   // If this is a question mark, then we need to compare the rest with
   // the current string or the string with one character eaten.
   if (pattern[0] == '?') {
     if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
                       depth + 1, next))
       return true;
     const CHAR* next_eval = eval;
     next(&next_eval, eval_end);
     return MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
                          depth + 1, next);
   }

   // This is a *, try to match all the possible substrings with the remainder
   // of the pattern.
   if (pattern[0] == '*') {
     // Collapse duplicate wild cards (********** into *) so that the
     // method does not recurse unnecessarily. http://crbug.com/52839
     EatWildcard(&next_pattern, pattern_end, next);

     while (eval != eval_end) {
       if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
                         depth + 1, next))
         return true;
       // Step by a whole character, exactly as the '?' branch does. Stepping
       // by a single code unit would retry the remainder of the pattern from
       // the middle of a multi-unit character, which is wasted work and could
       // match the pattern against a fragment of a character.
       next(&eval, eval_end);
     }

     // We reached the end of the string. |next_pattern| already has every
     // wildcard that followed the '*' stripped, so the match succeeds only if
     // nothing else is left in the pattern.
     return next_pattern == pattern_end;
   }

   // EatSameChars stopped on a literal mismatch.
   return false;
 }
 978
 979 struct NextCharUTF8 {
 980   base_icu::UChar32 operator()(const char** p, const char* end) {
 981     base_icu::UChar32 c;
 982     int offset = 0;
 983     CBU8_NEXT(*p, offset, end - *p, c);
 984     *p += offset;
 985     return c;
 986   }
 987 };
 988
 989 struct NextCharUTF16 {
 990   base_icu::UChar32 operator()(const char16** p, const char16* end) {
 991     base_icu::UChar32 c;
 992     int offset = 0;
 993     CBU16_NEXT(*p, offset, end - *p, c);
 994     *p += offset;
 995     return c;
 996   }
 997 };
 998
 999 bool MatchPattern(const base::StringPiece& eval,
1000                   const base::StringPiece& pattern) {
1001   return MatchPatternT(eval.data(), eval.data() + eval.size(),
1002                        pattern.data(), pattern.data() + pattern.size(),
1003                        0, NextCharUTF8());
1004 }
1005
1006 bool MatchPattern(const string16& eval, const string16& pattern) {
1007   return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
1008                        pattern.c_str(), pattern.c_str() + pattern.size(),
1009                        0, NextCharUTF16());
1010 }
1011
1012 // The following code is compatible with the OpenBSD lcpy interface.  See:
1013 //   http://www.gratisoft.us/todd/papers/strlcpy.html
1014 //   ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1015
namespace {

// Bounded string copy shared by base::strlcpy() and base::wcslcpy().
//
// Copies characters from the NUL-terminated |src| into |dst|, which has room
// for |dst_size| characters. Whenever |dst_size| is non-zero the result in
// |dst| is NUL-terminated, truncating |src| if it does not fit. With a
// |dst_size| of zero, |dst| is not written at all.
//
// Returns the length of |src| in characters, not counting the terminator. A
// return value >= |dst_size| therefore tells the caller that truncation
// happened.
template <typename CHAR>
size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
  size_t pos = 0;
  while (pos < dst_size) {
    const CHAR c = src[pos];
    dst[pos] = c;
    if (c == 0)  // The terminator has been copied: |src| fit entirely.
      return pos;
    ++pos;
  }

  // |dst| filled up before the end of |src| was seen. The last slot currently
  // holds a character of |src|; overwrite it with the terminator.
  if (dst_size > 0)
    dst[dst_size - 1] = 0;

  // Walk the remainder of |src| so that its full length can be reported.
  size_t src_length = dst_size;
  while (src[src_length] != 0)
    ++src_length;
  return src_length;
}

}  // namespace
1035
1036 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {
1037   return lcpyT<char>(dst, src, dst_size);
1038 }
1039 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
1040   return lcpyT<wchar_t>(dst, src, dst_size);
1041 }