base/strings/string_util.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "base/strings/string_util.h"
   6
   7 #include <ctype.h>
   8 #include <errno.h>
   9 #include <math.h>
  10 #include <stdarg.h>
  11 #include <stdio.h>
  12 #include <stdlib.h>
  13 #include <string.h>
  14 #include <time.h>
  15 #include <wchar.h>
  16 #include <wctype.h>
  17
  18 #include <algorithm>
  19 #include <vector>
  20
  21 #include "base/basictypes.h"
  22 #include "base/logging.h"
  23 #include "base/memory/singleton.h"
  24 #include "base/strings/utf_string_conversion_utils.h"
  25 #include "base/strings/utf_string_conversions.h"
  26 #include "base/third_party/icu/icu_utf.h"
  27 #include "build/build_config.h"
  28
  29 // Remove when this entire file is in the base namespace.
  30 using base::char16;
  31 using base::string16;
  32
  33 namespace {
  34
  35 // Force the singleton used by EmptyString[16] to be a unique type. This
  36 // prevents other code that might accidentally use Singleton<string> from
  37 // getting our internal one.
  38 struct EmptyStrings {
  39   EmptyStrings() {}
  40   const std::string s;
  41   const string16 s16;
  42
  43   static EmptyStrings* GetInstance() {
  44     return Singleton<EmptyStrings>::get();
  45   }
  46 };
  47
  48 // Used by ReplaceStringPlaceholders to track the position in the string of
  49 // replaced parameters.
  50 struct ReplacementOffset {
  51   ReplacementOffset(uintptr_t parameter, size_t offset)
  52       : parameter(parameter),
  53         offset(offset) {}
  54
  55   // Index of the parameter.
  56   uintptr_t parameter;
  57
  58   // Starting position in the string.
  59   size_t offset;
  60 };
  61
  62 static bool CompareParameter(const ReplacementOffset& elem1,
  63                              const ReplacementOffset& elem2) {
  64   return elem1.parameter < elem2.parameter;
  65 }
  66
  67 // Assuming that a pointer is the size of a "machine word", then
  68 // uintptr_t is an integer type that is also a machine word.
  69 typedef uintptr_t MachineWord;
  70 const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;
  71
  72 inline bool IsAlignedToMachineWord(const void* pointer) {
  73   return !(reinterpret_cast<MachineWord>(pointer) & kMachineWordAlignmentMask);
  74 }
  75
  76 template<typename T> inline T* AlignToMachineWord(T* pointer) {
  77   return reinterpret_cast<T*>(reinterpret_cast<MachineWord>(pointer) &
  78                               ~kMachineWordAlignmentMask);
  79 }
  80
  81 template<size_t size, typename CharacterType> struct NonASCIIMask;
  82 template<> struct NonASCIIMask<4, base::char16> {
  83     static inline uint32_t value() { return 0xFF80FF80U; }
  84 };
  85 template<> struct NonASCIIMask<4, char> {
  86     static inline uint32_t value() { return 0x80808080U; }
  87 };
  88 template<> struct NonASCIIMask<8, base::char16> {
  89     static inline uint64_t value() { return 0xFF80FF80FF80FF80ULL; }
  90 };
  91 template<> struct NonASCIIMask<8, char> {
  92     static inline uint64_t value() { return 0x8080808080808080ULL; }
  93 };
  94 #if defined(WCHAR_T_IS_UTF32)
  95 template<> struct NonASCIIMask<4, wchar_t> {
  96     static inline uint32_t value() { return 0xFFFFFF80U; }
  97 };
  98 template<> struct NonASCIIMask<8, wchar_t> {
  99     static inline uint64_t value() { return 0xFFFFFF80FFFFFF80ULL; }
 100 };
 101 #endif  // WCHAR_T_IS_UTF32
 102
 103 }  // namespace
 104
 105 namespace base {
 106
 107 bool IsWprintfFormatPortable(const wchar_t* format) {
 108   for (const wchar_t* position = format; *position != '\0'; ++position) {
 109     if (*position == '%') {
 110       bool in_specification = true;
 111       bool modifier_l = false;
 112       while (in_specification) {
 113         // Eat up characters until reaching a known specifier.
 114         if (*++position == '\0') {
 115           // The format string ended in the middle of a specification.  Call
 116           // it portable because no unportable specifications were found.  The
 117           // string is equally broken on all platforms.
 118           return true;
 119         }
 120
 121         if (*position == 'l') {
 122           // 'l' is the only thing that can save the 's' and 'c' specifiers.
 123           modifier_l = true;
 124         } else if (((*position == 's' || *position == 'c') && !modifier_l) ||
 125                    *position == 'S' || *position == 'C' || *position == 'F' ||
 126                    *position == 'D' || *position == 'O' || *position == 'U') {
 127           // Not portable.
 128           return false;
 129         }
 130
 131         if (wcschr(L"diouxXeEfgGaAcspn%", *position)) {
 132           // Portable, keep scanning the rest of the format string.
 133           in_specification = false;
 134         }
 135       }
 136     }
 137   }
 138
 139   return true;
 140 }
 141
 142 const std::string& EmptyString() {
 143   return EmptyStrings::GetInstance()->s;
 144 }
 145
 146 const string16& EmptyString16() {
 147   return EmptyStrings::GetInstance()->s16;
 148 }
 149
 150 template<typename STR>
 151 bool ReplaceCharsT(const STR& input,
 152                    const STR& replace_chars,
 153                    const STR& replace_with,
 154                    STR* output) {
 155   bool removed = false;
 156   size_t replace_length = replace_with.length();
 157
 158   *output = input;
 159
 160   size_t found = output->find_first_of(replace_chars);
 161   while (found != STR::npos) {
 162     removed = true;
 163     output->replace(found, 1, replace_with);
 164     found = output->find_first_of(replace_chars, found + replace_length);
 165   }
 166
 167   return removed;
 168 }
 169
 170 bool ReplaceChars(const string16& input,
 171                   const base::StringPiece16& replace_chars,
 172                   const string16& replace_with,
 173                   string16* output) {
 174   return ReplaceCharsT(input, replace_chars.as_string(), replace_with, output);
 175 }
 176
 177 bool ReplaceChars(const std::string& input,
 178                   const base::StringPiece& replace_chars,
 179                   const std::string& replace_with,
 180                   std::string* output) {
 181   return ReplaceCharsT(input, replace_chars.as_string(), replace_with, output);
 182 }
 183
 184 bool RemoveChars(const string16& input,
 185                  const base::StringPiece16& remove_chars,
 186                  string16* output) {
 187   return ReplaceChars(input, remove_chars.as_string(), string16(), output);
 188 }
 189
 190 bool RemoveChars(const std::string& input,
 191                  const base::StringPiece& remove_chars,
 192                  std::string* output) {
 193   return ReplaceChars(input, remove_chars.as_string(), std::string(), output);
 194 }
 195
 196 template<typename STR>
 197 TrimPositions TrimStringT(const STR& input,
 198                           const STR& trim_chars,
 199                           TrimPositions positions,
 200                           STR* output) {
 201   // Find the edges of leading/trailing whitespace as desired.
 202   const size_t last_char = input.length() - 1;
 203   const size_t first_good_char = (positions & TRIM_LEADING) ?
 204       input.find_first_not_of(trim_chars) : 0;
 205   const size_t last_good_char = (positions & TRIM_TRAILING) ?
 206       input.find_last_not_of(trim_chars) : last_char;
 207
 208   // When the string was all whitespace, report that we stripped off whitespace
 209   // from whichever position the caller was interested in.  For empty input, we
 210   // stripped no whitespace, but we still need to clear |output|.
 211   if (input.empty() ||
 212       (first_good_char == STR::npos) || (last_good_char == STR::npos)) {
 213     bool input_was_empty = input.empty();  // in case output == &input
 214     output->clear();
 215     return input_was_empty ? TRIM_NONE : positions;
 216   }
 217
 218   // Trim the whitespace.
 219   *output =
 220       input.substr(first_good_char, last_good_char - first_good_char + 1);
 221
 222   // Return where we trimmed from.
 223   return static_cast<TrimPositions>(
 224       ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
 225       ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
 226 }
 227
 228 bool TrimString(const string16& input,
 229                 const base::StringPiece16& trim_chars,
 230                 string16* output) {
 231   return TrimStringT(input, trim_chars.as_string(), TRIM_ALL, output) !=
 232       TRIM_NONE;
 233 }
 234
 235 bool TrimString(const std::string& input,
 236                 const base::StringPiece& trim_chars,
 237                 std::string* output) {
 238   return TrimStringT(input, trim_chars.as_string(), TRIM_ALL, output) !=
 239       TRIM_NONE;
 240 }
 241
 242 void TruncateUTF8ToByteSize(const std::string& input,
 243                             const size_t byte_size,
 244                             std::string* output) {
 245   DCHECK(output);
 246   if (byte_size > input.length()) {
 247     *output = input;
 248     return;
 249   }
 250   DCHECK_LE(byte_size, static_cast<uint32>(kint32max));
 251   // Note: This cast is necessary because CBU8_NEXT uses int32s.
 252   int32 truncation_length = static_cast<int32>(byte_size);
 253   int32 char_index = truncation_length - 1;
 254   const char* data = input.data();
 255
 256   // Using CBU8, we will move backwards from the truncation point
 257   // to the beginning of the string looking for a valid UTF8
 258   // character.  Once a full UTF8 character is found, we will
 259   // truncate the string to the end of that character.
 260   while (char_index >= 0) {
 261     int32 prev = char_index;
 262     base_icu::UChar32 code_point = 0;
 263     CBU8_NEXT(data, char_index, truncation_length, code_point);
 264     if (!IsValidCharacter(code_point) ||
 265         !IsValidCodepoint(code_point)) {
 266       char_index = prev - 1;
 267     } else {
 268       break;
 269     }
 270   }
 271
 272   if (char_index >= 0 )
 273     *output = input.substr(0, char_index);
 274   else
 275     output->clear();
 276 }
 277
 278 TrimPositions TrimWhitespace(const string16& input,
 279                              TrimPositions positions,
 280                              string16* output) {
 281   return TrimStringT(input, base::string16(kWhitespaceUTF16), positions,
 282                      output);
 283 }
 284
 285 TrimPositions TrimWhitespaceASCII(const std::string& input,
 286                                   TrimPositions positions,
 287                                   std::string* output) {
 288   return TrimStringT(input, std::string(kWhitespaceASCII), positions, output);
 289 }
 290
 291 // This function is only for backward-compatibility.
 292 // To be removed when all callers are updated.
 293 TrimPositions TrimWhitespace(const std::string& input,
 294                              TrimPositions positions,
 295                              std::string* output) {
 296   return TrimWhitespaceASCII(input, positions, output);
 297 }
 298
 299 template<typename STR>
 300 STR CollapseWhitespaceT(const STR& text,
 301                         bool trim_sequences_with_line_breaks) {
 302   STR result;
 303   result.resize(text.size());
 304
 305   // Set flags to pretend we're already in a trimmed whitespace sequence, so we
 306   // will trim any leading whitespace.
 307   bool in_whitespace = true;
 308   bool already_trimmed = true;
 309
 310   int chars_written = 0;
 311   for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
 312     if (IsWhitespace(*i)) {
 313       if (!in_whitespace) {
 314         // Reduce all whitespace sequences to a single space.
 315         in_whitespace = true;
 316         result[chars_written++] = L' ';
 317       }
 318       if (trim_sequences_with_line_breaks && !already_trimmed &&
 319           ((*i == '\n') || (*i == '\r'))) {
 320         // Whitespace sequences containing CR or LF are eliminated entirely.
 321         already_trimmed = true;
 322         --chars_written;
 323       }
 324     } else {
 325       // Non-whitespace chracters are copied straight across.
 326       in_whitespace = false;
 327       already_trimmed = false;
 328       result[chars_written++] = *i;
 329     }
 330   }
 331
 332   if (in_whitespace && !already_trimmed) {
 333     // Any trailing whitespace is eliminated.
 334     --chars_written;
 335   }
 336
 337   result.resize(chars_written);
 338   return result;
 339 }
 340
 341 string16 CollapseWhitespace(const string16& text,
 342                             bool trim_sequences_with_line_breaks) {
 343   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
 344 }
 345
 346 std::string CollapseWhitespaceASCII(const std::string& text,
 347                                     bool trim_sequences_with_line_breaks) {
 348   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
 349 }
 350
 351 bool ContainsOnlyChars(const StringPiece& input,
 352                        const StringPiece& characters) {
 353   return input.find_first_not_of(characters) == StringPiece::npos;
 354 }
 355
 356 bool ContainsOnlyChars(const StringPiece16& input,
 357                        const StringPiece16& characters) {
 358   return input.find_first_not_of(characters) == StringPiece16::npos;
 359 }
 360
 361 template <class Char>
 362 inline bool DoIsStringASCII(const Char* characters, size_t length) {
 363   MachineWord all_char_bits = 0;
 364   const Char* end = characters + length;
 365
 366   // Prologue: align the input.
 367   while (!IsAlignedToMachineWord(characters) && characters != end) {
 368     all_char_bits |= *characters;
 369     ++characters;
 370   }
 371
 372   // Compare the values of CPU word size.
 373   const Char* word_end = AlignToMachineWord(end);
 374   const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
 375   while (characters < word_end) {
 376     all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
 377     characters += loop_increment;
 378   }
 379
 380   // Process the remaining bytes.
 381   while (characters != end) {
 382     all_char_bits |= *characters;
 383     ++characters;
 384   }
 385
 386   MachineWord non_ascii_bit_mask =
 387       NonASCIIMask<sizeof(MachineWord), Char>::value();
 388   return !(all_char_bits & non_ascii_bit_mask);
 389 }
 390
 391 bool IsStringASCII(const StringPiece& str) {
 392   return DoIsStringASCII(str.data(), str.length());
 393 }
 394
 395 bool IsStringASCII(const StringPiece16& str) {
 396   return DoIsStringASCII(str.data(), str.length());
 397 }
 398
 399 bool IsStringASCII(const string16& str) {
 400   return DoIsStringASCII(str.data(), str.length());
 401 }
 402
 403 #if defined(WCHAR_T_IS_UTF32)
 404 bool IsStringASCII(const std::wstring& str) {
 405   return DoIsStringASCII(str.data(), str.length());
 406 }
 407 #endif
 408
 409 bool IsStringUTF8(const StringPiece& str) {
 410   const char *src = str.data();
 411   int32 src_len = static_cast<int32>(str.length());
 412   int32 char_index = 0;
 413
 414   while (char_index < src_len) {
 415     int32 code_point;
 416     CBU8_NEXT(src, char_index, src_len, code_point);
 417     if (!IsValidCharacter(code_point))
 418       return false;
 419   }
 420   return true;
 421 }
 422
 423 }  // namespace base
 424
 425 template<typename Iter>
 426 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
 427                                           Iter a_end,
 428                                           const char* b) {
 429   for (Iter it = a_begin; it != a_end; ++it, ++b) {
 430     if (!*b || base::ToLowerASCII(*it) != *b)
 431       return false;
 432   }
 433   return *b == 0;
 434 }
 435
 436 // Front-ends for LowerCaseEqualsASCII.
 437 bool LowerCaseEqualsASCII(const std::string& a, const char* b) {
 438   return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
 439 }
 440
 441 bool LowerCaseEqualsASCII(const string16& a, const char* b) {
 442   return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
 443 }
 444
 445 bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
 446                           std::string::const_iterator a_end,
 447                           const char* b) {
 448   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 449 }
 450
 451 bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
 452                           string16::const_iterator a_end,
 453                           const char* b) {
 454   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 455 }
 456
 457 // TODO(port): Resolve wchar_t/iterator issues that require OS_ANDROID here.
 458 #if !defined(OS_ANDROID)
 459 bool LowerCaseEqualsASCII(const char* a_begin,
 460                           const char* a_end,
 461                           const char* b) {
 462   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 463 }
 464
 465 bool LowerCaseEqualsASCII(const char16* a_begin,
 466                           const char16* a_end,
 467                           const char* b) {
 468   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 469 }
 470
 471 #endif  // !defined(OS_ANDROID)
 472
 473 bool EqualsASCII(const string16& a, const base::StringPiece& b) {
 474   if (a.length() != b.length())
 475     return false;
 476   return std::equal(b.begin(), b.end(), a.begin());
 477 }
 478
 479 bool StartsWithASCII(const std::string& str,
 480                      const std::string& search,
 481                      bool case_sensitive) {
 482   if (case_sensitive)
 483     return str.compare(0, search.length(), search) == 0;
 484   else
 485     return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0;
 486 }
 487
 488 template <typename STR>
 489 bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) {
 490   if (case_sensitive) {
 491     return str.compare(0, search.length(), search) == 0;
 492   } else {
 493     if (search.size() > str.size())
 494       return false;
 495     return std::equal(search.begin(), search.end(), str.begin(),
 496                       base::CaseInsensitiveCompare<typename STR::value_type>());
 497   }
 498 }
 499
 500 bool StartsWith(const string16& str, const string16& search,
 501                 bool case_sensitive) {
 502   return StartsWithT(str, search, case_sensitive);
 503 }
 504
 505 template <typename STR>
 506 bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) {
 507   size_t str_length = str.length();
 508   size_t search_length = search.length();
 509   if (search_length > str_length)
 510     return false;
 511   if (case_sensitive)
 512     return str.compare(str_length - search_length, search_length, search) == 0;
 513   return std::equal(search.begin(), search.end(),
 514                     str.begin() + (str_length - search_length),
 515                     base::CaseInsensitiveCompare<typename STR::value_type>());
 516 }
 517
 518 bool EndsWith(const std::string& str, const std::string& search,
 519               bool case_sensitive) {
 520   return EndsWithT(str, search, case_sensitive);
 521 }
 522
 523 bool EndsWith(const string16& str, const string16& search,
 524               bool case_sensitive) {
 525   return EndsWithT(str, search, case_sensitive);
 526 }
 527
 528 static const char* const kByteStringsUnlocalized[] = {
 529   " B",
 530   " kB",
 531   " MB",
 532   " GB",
 533   " TB",
 534   " PB"
 535 };
 536
 537 string16 FormatBytesUnlocalized(int64 bytes) {
 538   double unit_amount = static_cast<double>(bytes);
 539   size_t dimension = 0;
 540   const int kKilo = 1024;
 541   while (unit_amount >= kKilo &&
 542          dimension < arraysize(kByteStringsUnlocalized) - 1) {
 543     unit_amount /= kKilo;
 544     dimension++;
 545   }
 546
 547   char buf[64];
 548   if (bytes != 0 && dimension > 0 && unit_amount < 100) {
 549     base::snprintf(buf, arraysize(buf), "%.1lf%s", unit_amount,
 550                    kByteStringsUnlocalized[dimension]);
 551   } else {
 552     base::snprintf(buf, arraysize(buf), "%.0lf%s", unit_amount,
 553                    kByteStringsUnlocalized[dimension]);
 554   }
 555
 556   return base::ASCIIToUTF16(buf);
 557 }
 558
 559 // Runs in O(n) time in the length of |str|.
 560 template<class StringType>
 561 void DoReplaceSubstringsAfterOffset(StringType* str,
 562                                     size_t offset,
 563                                     const StringType& find_this,
 564                                     const StringType& replace_with,
 565                                     bool replace_all) {
 566   DCHECK(!find_this.empty());
 567
 568   // If the find string doesn't appear, there's nothing to do.
 569   offset = str->find(find_this, offset);
 570   if (offset == StringType::npos)
 571     return;
 572
 573   // If we're only replacing one instance, there's no need to do anything
 574   // complicated.
 575   size_t find_length = find_this.length();
 576   if (!replace_all) {
 577     str->replace(offset, find_length, replace_with);
 578     return;
 579   }
 580
 581   // If the find and replace strings are the same length, we can simply use
 582   // replace() on each instance, and finish the entire operation in O(n) time.
 583   size_t replace_length = replace_with.length();
 584   if (find_length == replace_length) {
 585     do {
 586       str->replace(offset, find_length, replace_with);
 587       offset = str->find(find_this, offset + replace_length);
 588     } while (offset != StringType::npos);
 589     return;
 590   }
 591
 592   // Since the find and replace strings aren't the same length, a loop like the
 593   // one above would be O(n^2) in the worst case, as replace() will shift the
 594   // entire remaining string each time.  We need to be more clever to keep
 595   // things O(n).
 596   //
 597   // If we're shortening the string, we can alternate replacements with shifting
 598   // forward the intervening characters using memmove().
 599   size_t str_length = str->length();
 600   if (find_length > replace_length) {
 601     size_t write_offset = offset;
 602     do {
 603       if (replace_length) {
 604         str->replace(write_offset, replace_length, replace_with);
 605         write_offset += replace_length;
 606       }
 607       size_t read_offset = offset + find_length;
 608       offset = std::min(str->find(find_this, read_offset), str_length);
 609       size_t length = offset - read_offset;
 610       if (length) {
 611         memmove(&(*str)[write_offset], &(*str)[read_offset],
 612                 length * sizeof(typename StringType::value_type));
 613         write_offset += length;
 614       }
 615     } while (offset < str_length);
 616     str->resize(write_offset);
 617     return;
 618   }
 619
 620   // We're lengthening the string.  We can use alternating replacements and
 621   // memmove() calls like above, but we need to precalculate the final string
 622   // length and then expand from back-to-front to avoid overwriting the string
 623   // as we're reading it, needing to shift, or having to copy to a second string
 624   // temporarily.
 625   size_t first_match = offset;
 626
 627   // First, calculate the final length and resize the string.
 628   size_t final_length = str_length;
 629   size_t expansion = replace_length - find_length;
 630   size_t current_match;
 631   do {
 632     final_length += expansion;
 633     // Minor optimization: save this offset into |current_match|, so that on
 634     // exit from the loop, |current_match| will point at the last instance of
 635     // the find string, and we won't need to find() it again immediately.
 636     current_match = offset;
 637     offset = str->find(find_this, offset + find_length);
 638   } while (offset != StringType::npos);
 639   str->resize(final_length);
 640
 641   // Now do the replacement loop, working backwards through the string.
 642   for (size_t prev_match = str_length, write_offset = final_length; ;
 643        current_match = str->rfind(find_this, current_match - 1)) {
 644     size_t read_offset = current_match + find_length;
 645     size_t length = prev_match - read_offset;
 646     if (length) {
 647       write_offset -= length;
 648       memmove(&(*str)[write_offset], &(*str)[read_offset],
 649               length * sizeof(typename StringType::value_type));
 650     }
 651     write_offset -= replace_length;
 652     str->replace(write_offset, replace_length, replace_with);
 653     if (current_match == first_match)
 654       return;
 655     prev_match = current_match;
 656   }
 657 }
 658
 659 void ReplaceFirstSubstringAfterOffset(string16* str,
 660                                       size_t start_offset,
 661                                       const string16& find_this,
 662                                       const string16& replace_with) {
 663   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 664                                  false);  // replace first instance
 665 }
 666
 667 void ReplaceFirstSubstringAfterOffset(std::string* str,
 668                                       size_t start_offset,
 669                                       const std::string& find_this,
 670                                       const std::string& replace_with) {
 671   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 672                                  false);  // replace first instance
 673 }
 674
 675 void ReplaceSubstringsAfterOffset(string16* str,
 676                                   size_t start_offset,
 677                                   const string16& find_this,
 678                                   const string16& replace_with) {
 679   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 680                                  true);  // replace all instances
 681 }
 682
 683 void ReplaceSubstringsAfterOffset(std::string* str,
 684                                   size_t start_offset,
 685                                   const std::string& find_this,
 686                                   const std::string& replace_with) {
 687   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 688                                  true);  // replace all instances
 689 }
 690
 691
 692 template<typename STR>
 693 static size_t TokenizeT(const STR& str,
 694                         const STR& delimiters,
 695                         std::vector<STR>* tokens) {
 696   tokens->clear();
 697
 698   size_t start = str.find_first_not_of(delimiters);
 699   while (start != STR::npos) {
 700     size_t end = str.find_first_of(delimiters, start + 1);
 701     if (end == STR::npos) {
 702       tokens->push_back(str.substr(start));
 703       break;
 704     } else {
 705       tokens->push_back(str.substr(start, end - start));
 706       start = str.find_first_not_of(delimiters, end + 1);
 707     }
 708   }
 709
 710   return tokens->size();
 711 }
 712
 713 size_t Tokenize(const string16& str,
 714                 const string16& delimiters,
 715                 std::vector<string16>* tokens) {
 716   return TokenizeT(str, delimiters, tokens);
 717 }
 718
 719 size_t Tokenize(const std::string& str,
 720                 const std::string& delimiters,
 721                 std::vector<std::string>* tokens) {
 722   return TokenizeT(str, delimiters, tokens);
 723 }
 724
 725 size_t Tokenize(const base::StringPiece& str,
 726                 const base::StringPiece& delimiters,
 727                 std::vector<base::StringPiece>* tokens) {
 728   return TokenizeT(str, delimiters, tokens);
 729 }
 730
 731 template<typename STR>
 732 static STR JoinStringT(const std::vector<STR>& parts, const STR& sep) {
 733   if (parts.empty())
 734     return STR();
 735
 736   STR result(parts[0]);
 737   typename std::vector<STR>::const_iterator iter = parts.begin();
 738   ++iter;
 739
 740   for (; iter != parts.end(); ++iter) {
 741     result += sep;
 742     result += *iter;
 743   }
 744
 745   return result;
 746 }
 747
 748 std::string JoinString(const std::vector<std::string>& parts, char sep) {
 749   return JoinStringT(parts, std::string(1, sep));
 750 }
 751
 752 string16 JoinString(const std::vector<string16>& parts, char16 sep) {
 753   return JoinStringT(parts, string16(1, sep));
 754 }
 755
 756 std::string JoinString(const std::vector<std::string>& parts,
 757                        const std::string& separator) {
 758   return JoinStringT(parts, separator);
 759 }
 760
 761 string16 JoinString(const std::vector<string16>& parts,
 762                     const string16& separator) {
 763   return JoinStringT(parts, separator);
 764 }
 765
 766 template<class FormatStringType, class OutStringType>
 767 OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string,
 768     const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) {
 769   size_t substitutions = subst.size();
 770
 771   size_t sub_length = 0;
 772   for (typename std::vector<OutStringType>::const_iterator iter = subst.begin();
 773        iter != subst.end(); ++iter) {
 774     sub_length += iter->length();
 775   }
 776
 777   OutStringType formatted;
 778   formatted.reserve(format_string.length() + sub_length);
 779
 780   std::vector<ReplacementOffset> r_offsets;
 781   for (typename FormatStringType::const_iterator i = format_string.begin();
 782        i != format_string.end(); ++i) {
 783     if ('$' == *i) {
 784       if (i + 1 != format_string.end()) {
 785         ++i;
 786         DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i;
 787         if ('$' == *i) {
 788           while (i != format_string.end() && '$' == *i) {
 789             formatted.push_back('$');
 790             ++i;
 791           }
 792           --i;
 793         } else {
 794           uintptr_t index = 0;
 795           while (i != format_string.end() && '0' <= *i && *i <= '9') {
 796             index *= 10;
 797             index += *i - '0';
 798             ++i;
 799           }
 800           --i;
 801           index -= 1;
 802           if (offsets) {
 803             ReplacementOffset r_offset(index,
 804                 static_cast<int>(formatted.size()));
 805             r_offsets.insert(std::lower_bound(r_offsets.begin(),
 806                                               r_offsets.end(),
 807                                               r_offset,
 808                                               &CompareParameter),
 809                              r_offset);
 810           }
 811           if (index < substitutions)
 812             formatted.append(subst.at(index));
 813         }
 814       }
 815     } else {
 816       formatted.push_back(*i);
 817     }
 818   }
 819   if (offsets) {
 820     for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin();
 821          i != r_offsets.end(); ++i) {
 822       offsets->push_back(i->offset);
 823     }
 824   }
 825   return formatted;
 826 }
 827
 828 string16 ReplaceStringPlaceholders(const string16& format_string,
 829                                    const std::vector<string16>& subst,
 830                                    std::vector<size_t>* offsets) {
 831   return DoReplaceStringPlaceholders(format_string, subst, offsets);
 832 }
 833
 834 std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
 835                                       const std::vector<std::string>& subst,
 836                                       std::vector<size_t>* offsets) {
 837   return DoReplaceStringPlaceholders(format_string, subst, offsets);
 838 }
 839
 840 string16 ReplaceStringPlaceholders(const string16& format_string,
 841                                    const string16& a,
 842                                    size_t* offset) {
 843   std::vector<size_t> offsets;
 844   std::vector<string16> subst;
 845   subst.push_back(a);
 846   string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets);
 847
 848   DCHECK_EQ(1U, offsets.size());
 849   if (offset)
 850     *offset = offsets[0];
 851   return result;
 852 }
 853
 854 static bool IsWildcard(base_icu::UChar32 character) {
 855   return character == '*' || character == '?';
 856 }
 857
 858 // Move the strings pointers to the point where they start to differ.
 859 template <typename CHAR, typename NEXT>
 860 static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
 861                          const CHAR** string, const CHAR* string_end,
 862                          NEXT next) {
 863   const CHAR* escape = NULL;
 864   while (*pattern != pattern_end && *string != string_end) {
 865     if (!escape && IsWildcard(**pattern)) {
 866       // We don't want to match wildcard here, except if it's escaped.
 867       return;
 868     }
 869
 870     // Check if the escapement char is found. If so, skip it and move to the
 871     // next character.
 872     if (!escape && **pattern == '\\') {
 873       escape = *pattern;
 874       next(pattern, pattern_end);
 875       continue;
 876     }
 877
 878     // Check if the chars match, if so, increment the ptrs.
 879     const CHAR* pattern_next = *pattern;
 880     const CHAR* string_next = *string;
 881     base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
 882     if (pattern_char == next(&string_next, string_end) &&
 883         pattern_char != CBU_SENTINEL) {
 884       *pattern = pattern_next;
 885       *string = string_next;
 886     } else {
 887       // Uh oh, it did not match, we are done. If the last char was an
 888       // escapement, that means that it was an error to advance the ptr here,
 889       // let's put it back where it was. This also mean that the MatchPattern
 890       // function will return false because if we can't match an escape char
 891       // here, then no one will.
 892       if (escape) {
 893         *pattern = escape;
 894       }
 895       return;
 896     }
 897
 898     escape = NULL;
 899   }
 900 }
 901
 902 template <typename CHAR, typename NEXT>
 903 static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
 904   while (*pattern != end) {
 905     if (!IsWildcard(**pattern))
 906       return;
 907     next(pattern, end);
 908   }
 909 }
 910
 911 template <typename CHAR, typename NEXT>
 912 static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
 913                           const CHAR* pattern, const CHAR* pattern_end,
 914                           int depth,
 915                           NEXT next) {
 916   const int kMaxDepth = 16;
 917   if (depth > kMaxDepth)
 918     return false;
 919
 920   // Eat all the matching chars.
 921   EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
 922
 923   // If the string is empty, then the pattern must be empty too, or contains
 924   // only wildcards.
 925   if (eval == eval_end) {
 926     EatWildcard(&pattern, pattern_end, next);
 927     return pattern == pattern_end;
 928   }
 929
 930   // Pattern is empty but not string, this is not a match.
 931   if (pattern == pattern_end)
 932     return false;
 933
 934   // If this is a question mark, then we need to compare the rest with
 935   // the current string or the string with one character eaten.
 936   const CHAR* next_pattern = pattern;
 937   next(&next_pattern, pattern_end);
 938   if (pattern[0] == '?') {
 939     if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
 940                       depth + 1, next))
 941       return true;
 942     const CHAR* next_eval = eval;
 943     next(&next_eval, eval_end);
 944     if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
 945                       depth + 1, next))
 946       return true;
 947   }
 948
 949   // This is a *, try to match all the possible substrings with the remainder
 950   // of the pattern.
 951   if (pattern[0] == '*') {
 952     // Collapse duplicate wild cards (********** into *) so that the
 953     // method does not recurse unnecessarily. http://crbug.com/52839
 954     EatWildcard(&next_pattern, pattern_end, next);
 955
 956     while (eval != eval_end) {
 957       if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
 958                         depth + 1, next))
 959         return true;
 960       eval++;
 961     }
 962
 963     // We reached the end of the string, let see if the pattern contains only
 964     // wildcards.
 965     if (eval == eval_end) {
 966       EatWildcard(&pattern, pattern_end, next);
 967       if (pattern != pattern_end)
 968         return false;
 969       return true;
 970     }
 971   }
 972
 973   return false;
 974 }
 975
 976 struct NextCharUTF8 {
 977   base_icu::UChar32 operator()(const char** p, const char* end) {
 978     base_icu::UChar32 c;
 979     int offset = 0;
 980     CBU8_NEXT(*p, offset, end - *p, c);
 981     *p += offset;
 982     return c;
 983   }
 984 };
 985
 986 struct NextCharUTF16 {
 987   base_icu::UChar32 operator()(const char16** p, const char16* end) {
 988     base_icu::UChar32 c;
 989     int offset = 0;
 990     CBU16_NEXT(*p, offset, end - *p, c);
 991     *p += offset;
 992     return c;
 993   }
 994 };
 995
 996 bool MatchPattern(const base::StringPiece& eval,
 997                   const base::StringPiece& pattern) {
 998   return MatchPatternT(eval.data(), eval.data() + eval.size(),
 999                        pattern.data(), pattern.data() + pattern.size(),
1000                        0, NextCharUTF8());
1001 }
1002
1003 bool MatchPattern(const string16& eval, const string16& pattern) {
1004   return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
1005                        pattern.c_str(), pattern.c_str() + pattern.size(),
1006                        0, NextCharUTF16());
1007 }
1008
1009 // The following code is compatible with the OpenBSD lcpy interface.  See:
1010 //   http://www.gratisoft.us/todd/papers/strlcpy.html
1011 //   ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1012
namespace {

// Shared implementation behind strlcpy and wcslcpy. Copies |src| into |dst|,
// writing at most |dst_size| characters including the terminator. Whenever
// |dst_size| is non-zero the result in |dst| is null terminated. Returns the
// length of |src| in characters, not counting its terminator; a return value
// >= |dst_size| therefore means the copy was truncated.
template <typename CHAR>
size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
  size_t pos = 0;
  while (pos < dst_size) {
    const CHAR ch = src[pos];
    dst[pos] = ch;
    if (ch == 0)  // The terminator was reached and copied; everything fit.
      return pos;
    ++pos;
  }

  // |dst| was filled without reaching the end of |src|. The final slot holds
  // a source character, so overwrite it with the terminator.
  if (dst_size > 0)
    dst[dst_size - 1] = 0;

  // Walk the remainder of |src| so that its full length can be returned.
  size_t src_length = dst_size;
  while (src[src_length] != 0)
    ++src_length;
  return src_length;
}

}  // namespace
1032
1033 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {
1034   return lcpyT<char>(dst, src, dst_size);
1035 }
1036 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
1037   return lcpyT<wchar_t>(dst, src, dst_size);
1038 }