base/strings/string_util.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "base/strings/string_util.h"
   6
   7 #include <ctype.h>
   8 #include <errno.h>
   9 #include <math.h>
  10 #include <stdarg.h>
  11 #include <stdio.h>
  12 #include <stdlib.h>
  13 #include <string.h>
  14 #include <time.h>
  15 #include <wchar.h>
  16 #include <wctype.h>
  17
  18 #include <algorithm>
  19 #include <vector>
  20
  21 #include "base/basictypes.h"
  22 #include "base/logging.h"
  23 #include "base/memory/singleton.h"
  24 #include "base/strings/utf_string_conversion_utils.h"
  25 #include "base/strings/utf_string_conversions.h"
  26 #include "base/third_party/icu/icu_utf.h"
  27 #include "build/build_config.h"
  28
  29 // Remove when this entire file is in the base namespace.
  30 using base::char16;
  31 using base::string16;
  32
  33 namespace {
  34
  35 // Force the singleton used by EmptyString[16] to be a unique type. This
  36 // prevents other code that might accidentally use Singleton<string> from
  37 // getting our internal one.
  38 struct EmptyStrings {
  39   EmptyStrings() {}
  40   const std::string s;
  41   const string16 s16;
  42
  43   static EmptyStrings* GetInstance() {
  44     return Singleton<EmptyStrings>::get();
  45   }
  46 };
  47
  48 // Used by ReplaceStringPlaceholders to track the position in the string of
  49 // replaced parameters.
  50 struct ReplacementOffset {
  51   ReplacementOffset(uintptr_t parameter, size_t offset)
  52       : parameter(parameter),
  53         offset(offset) {}
  54
  55   // Index of the parameter.
  56   uintptr_t parameter;
  57
  58   // Starting position in the string.
  59   size_t offset;
  60 };
  61
  62 static bool CompareParameter(const ReplacementOffset& elem1,
  63                              const ReplacementOffset& elem2) {
  64   return elem1.parameter < elem2.parameter;
  65 }
  66
  67 // Assuming that a pointer is the size of a "machine word", then
  68 // uintptr_t is an integer type that is also a machine word.
  69 typedef uintptr_t MachineWord;
  70 const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;
  71
  72 inline bool IsAlignedToMachineWord(const void* pointer) {
  73   return !(reinterpret_cast<MachineWord>(pointer) & kMachineWordAlignmentMask);
  74 }
  75
  76 template<typename T> inline T* AlignToMachineWord(T* pointer) {
  77   return reinterpret_cast<T*>(reinterpret_cast<MachineWord>(pointer) &
  78                               ~kMachineWordAlignmentMask);
  79 }
  80
  81 template<size_t size, typename CharacterType> struct NonASCIIMask;
  82 template<> struct NonASCIIMask<4, base::char16> {
  83     static inline uint32_t value() { return 0xFF80FF80U; }
  84 };
  85 template<> struct NonASCIIMask<4, char> {
  86     static inline uint32_t value() { return 0x80808080U; }
  87 };
  88 template<> struct NonASCIIMask<8, base::char16> {
  89     static inline uint64_t value() { return 0xFF80FF80FF80FF80ULL; }
  90 };
  91 template<> struct NonASCIIMask<8, char> {
  92     static inline uint64_t value() { return 0x8080808080808080ULL; }
  93 };
  94 #if defined(WCHAR_T_IS_UTF32)
  95 template<> struct NonASCIIMask<4, wchar_t> {
  96     static inline uint32_t value() { return 0xFFFFFF80U; }
  97 };
  98 template<> struct NonASCIIMask<8, wchar_t> {
  99     static inline uint64_t value() { return 0xFFFFFF80FFFFFF80ULL; }
 100 };
 101 #endif  // WCHAR_T_IS_UTF32
 102
 103 }  // namespace
 104
 105 namespace base {
 106
 107 bool IsWprintfFormatPortable(const wchar_t* format) {
 108   for (const wchar_t* position = format; *position != '\0'; ++position) {
 109     if (*position == '%') {
 110       bool in_specification = true;
 111       bool modifier_l = false;
 112       while (in_specification) {
 113         // Eat up characters until reaching a known specifier.
 114         if (*++position == '\0') {
 115           // The format string ended in the middle of a specification.  Call
 116           // it portable because no unportable specifications were found.  The
 117           // string is equally broken on all platforms.
 118           return true;
 119         }
 120
 121         if (*position == 'l') {
 122           // 'l' is the only thing that can save the 's' and 'c' specifiers.
 123           modifier_l = true;
 124         } else if (((*position == 's' || *position == 'c') && !modifier_l) ||
 125                    *position == 'S' || *position == 'C' || *position == 'F' ||
 126                    *position == 'D' || *position == 'O' || *position == 'U') {
 127           // Not portable.
 128           return false;
 129         }
 130
 131         if (wcschr(L"diouxXeEfgGaAcspn%", *position)) {
 132           // Portable, keep scanning the rest of the format string.
 133           in_specification = false;
 134         }
 135       }
 136     }
 137   }
 138
 139   return true;
 140 }
 141
 142 const std::string& EmptyString() {
 143   return EmptyStrings::GetInstance()->s;
 144 }
 145
 146 const string16& EmptyString16() {
 147   return EmptyStrings::GetInstance()->s16;
 148 }
 149
 150 template<typename STR>
 151 bool ReplaceCharsT(const STR& input,
 152                    const STR& replace_chars,
 153                    const STR& replace_with,
 154                    STR* output) {
 155   bool removed = false;
 156   size_t replace_length = replace_with.length();
 157
 158   *output = input;
 159
 160   size_t found = output->find_first_of(replace_chars);
 161   while (found != STR::npos) {
 162     removed = true;
 163     output->replace(found, 1, replace_with);
 164     found = output->find_first_of(replace_chars, found + replace_length);
 165   }
 166
 167   return removed;
 168 }
 169
 170 bool ReplaceChars(const string16& input,
 171                   const base::StringPiece16& replace_chars,
 172                   const string16& replace_with,
 173                   string16* output) {
 174   return ReplaceCharsT(input, replace_chars.as_string(), replace_with, output);
 175 }
 176
 177 bool ReplaceChars(const std::string& input,
 178                   const base::StringPiece& replace_chars,
 179                   const std::string& replace_with,
 180                   std::string* output) {
 181   return ReplaceCharsT(input, replace_chars.as_string(), replace_with, output);
 182 }
 183
 184 bool RemoveChars(const string16& input,
 185                  const base::StringPiece16& remove_chars,
 186                  string16* output) {
 187   return ReplaceChars(input, remove_chars.as_string(), string16(), output);
 188 }
 189
 190 bool RemoveChars(const std::string& input,
 191                  const base::StringPiece& remove_chars,
 192                  std::string* output) {
 193   return ReplaceChars(input, remove_chars.as_string(), std::string(), output);
 194 }
 195
 196 template<typename STR>
 197 TrimPositions TrimStringT(const STR& input,
 198                           const STR& trim_chars,
 199                           TrimPositions positions,
 200                           STR* output) {
 201   // Find the edges of leading/trailing whitespace as desired.
 202   const size_t last_char = input.length() - 1;
 203   const size_t first_good_char = (positions & TRIM_LEADING) ?
 204       input.find_first_not_of(trim_chars) : 0;
 205   const size_t last_good_char = (positions & TRIM_TRAILING) ?
 206       input.find_last_not_of(trim_chars) : last_char;
 207
 208   // When the string was all whitespace, report that we stripped off whitespace
 209   // from whichever position the caller was interested in.  For empty input, we
 210   // stripped no whitespace, but we still need to clear |output|.
 211   if (input.empty() ||
 212       (first_good_char == STR::npos) || (last_good_char == STR::npos)) {
 213     bool input_was_empty = input.empty();  // in case output == &input
 214     output->clear();
 215     return input_was_empty ? TRIM_NONE : positions;
 216   }
 217
 218   // Trim the whitespace.
 219   *output =
 220       input.substr(first_good_char, last_good_char - first_good_char + 1);
 221
 222   // Return where we trimmed from.
 223   return static_cast<TrimPositions>(
 224       ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
 225       ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
 226 }
 227
 228 bool TrimString(const string16& input,
 229                 const base::StringPiece16& trim_chars,
 230                 string16* output) {
 231   return TrimStringT(input, trim_chars.as_string(), TRIM_ALL, output) !=
 232       TRIM_NONE;
 233 }
 234
 235 bool TrimString(const std::string& input,
 236                 const base::StringPiece& trim_chars,
 237                 std::string* output) {
 238   return TrimStringT(input, trim_chars.as_string(), TRIM_ALL, output) !=
 239       TRIM_NONE;
 240 }
 241
 242 void TruncateUTF8ToByteSize(const std::string& input,
 243                             const size_t byte_size,
 244                             std::string* output) {
 245   DCHECK(output);
 246   if (byte_size > input.length()) {
 247     *output = input;
 248     return;
 249   }
 250   DCHECK_LE(byte_size, static_cast<uint32>(kint32max));
 251   // Note: This cast is necessary because CBU8_NEXT uses int32s.
 252   int32 truncation_length = static_cast<int32>(byte_size);
 253   int32 char_index = truncation_length - 1;
 254   const char* data = input.data();
 255
 256   // Using CBU8, we will move backwards from the truncation point
 257   // to the beginning of the string looking for a valid UTF8
 258   // character.  Once a full UTF8 character is found, we will
 259   // truncate the string to the end of that character.
 260   while (char_index >= 0) {
 261     int32 prev = char_index;
 262     base_icu::UChar32 code_point = 0;
 263     CBU8_NEXT(data, char_index, truncation_length, code_point);
 264     if (!IsValidCharacter(code_point) ||
 265         !IsValidCodepoint(code_point)) {
 266       char_index = prev - 1;
 267     } else {
 268       break;
 269     }
 270   }
 271
 272   if (char_index >= 0 )
 273     *output = input.substr(0, char_index);
 274   else
 275     output->clear();
 276 }
 277
 278 TrimPositions TrimWhitespace(const string16& input,
 279                              TrimPositions positions,
 280                              string16* output) {
 281   return TrimStringT(input, base::string16(kWhitespaceUTF16), positions,
 282                      output);
 283 }
 284
 285 TrimPositions TrimWhitespaceASCII(const std::string& input,
 286                                   TrimPositions positions,
 287                                   std::string* output) {
 288   return TrimStringT(input, std::string(kWhitespaceASCII), positions, output);
 289 }
 290
 291 // This function is only for backward-compatibility.
 292 // To be removed when all callers are updated.
 293 TrimPositions TrimWhitespace(const std::string& input,
 294                              TrimPositions positions,
 295                              std::string* output) {
 296   return TrimWhitespaceASCII(input, positions, output);
 297 }
 298
 299 template<typename STR>
 300 STR CollapseWhitespaceT(const STR& text,
 301                         bool trim_sequences_with_line_breaks) {
 302   STR result;
 303   result.resize(text.size());
 304
 305   // Set flags to pretend we're already in a trimmed whitespace sequence, so we
 306   // will trim any leading whitespace.
 307   bool in_whitespace = true;
 308   bool already_trimmed = true;
 309
 310   int chars_written = 0;
 311   for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
 312     if (IsWhitespace(*i)) {
 313       if (!in_whitespace) {
 314         // Reduce all whitespace sequences to a single space.
 315         in_whitespace = true;
 316         result[chars_written++] = L' ';
 317       }
 318       if (trim_sequences_with_line_breaks && !already_trimmed &&
 319           ((*i == '\n') || (*i == '\r'))) {
 320         // Whitespace sequences containing CR or LF are eliminated entirely.
 321         already_trimmed = true;
 322         --chars_written;
 323       }
 324     } else {
 325       // Non-whitespace chracters are copied straight across.
 326       in_whitespace = false;
 327       already_trimmed = false;
 328       result[chars_written++] = *i;
 329     }
 330   }
 331
 332   if (in_whitespace && !already_trimmed) {
 333     // Any trailing whitespace is eliminated.
 334     --chars_written;
 335   }
 336
 337   result.resize(chars_written);
 338   return result;
 339 }
 340
 341 string16 CollapseWhitespace(const string16& text,
 342                             bool trim_sequences_with_line_breaks) {
 343   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
 344 }
 345
 346 std::string CollapseWhitespaceASCII(const std::string& text,
 347                                     bool trim_sequences_with_line_breaks) {
 348   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
 349 }
 350
 351 bool ContainsOnlyChars(const StringPiece& input,
 352                        const StringPiece& characters) {
 353   return input.find_first_not_of(characters) == StringPiece::npos;
 354 }
 355
 356 bool ContainsOnlyChars(const StringPiece16& input,
 357                        const StringPiece16& characters) {
 358   return input.find_first_not_of(characters) == StringPiece16::npos;
 359 }
 360
 361 template <class Char>
 362 inline bool DoIsStringASCII(const Char* characters, size_t length) {
 363   MachineWord all_char_bits = 0;
 364   const Char* end = characters + length;
 365
 366   // Prologue: align the input.
 367   while (!IsAlignedToMachineWord(characters) && characters != end) {
 368     all_char_bits |= *characters;
 369     ++characters;
 370   }
 371
 372   // Compare the values of CPU word size.
 373   const Char* word_end = AlignToMachineWord(end);
 374   const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
 375   while (characters < word_end) {
 376     all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
 377     characters += loop_increment;
 378   }
 379
 380   // Process the remaining bytes.
 381   while (characters != end) {
 382     all_char_bits |= *characters;
 383     ++characters;
 384   }
 385
 386   MachineWord non_ascii_bit_mask =
 387       NonASCIIMask<sizeof(MachineWord), Char>::value();
 388   return !(all_char_bits & non_ascii_bit_mask);
 389 }
 390
 391 bool IsStringASCII(const StringPiece& str) {
 392   return DoIsStringASCII(str.data(), str.length());
 393 }
 394
 395 bool IsStringASCII(const StringPiece16& str) {
 396   return DoIsStringASCII(str.data(), str.length());
 397 }
 398
 399 bool IsStringASCII(const string16& str) {
 400   return DoIsStringASCII(str.data(), str.length());
 401 }
 402
 403 #if defined(WCHAR_T_IS_UTF32)
 404 bool IsStringASCII(const std::wstring& str) {
 405   return DoIsStringASCII(str.data(), str.length());
 406 }
 407 #endif
 408
 409 bool IsStringUTF8(const StringPiece& str) {
 410   const char *src = str.data();
 411   int32 src_len = static_cast<int32>(str.length());
 412   int32 char_index = 0;
 413
 414   while (char_index < src_len) {
 415     int32 code_point;
 416     CBU8_NEXT(src, char_index, src_len, code_point);
 417     if (!IsValidCharacter(code_point))
 418       return false;
 419   }
 420   return true;
 421 }
 422
 423 }  // namespace base
 424
 425 template<typename Iter>
 426 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
 427                                           Iter a_end,
 428                                           const char* b) {
 429   for (Iter it = a_begin; it != a_end; ++it, ++b) {
 430     if (!*b || base::ToLowerASCII(*it) != *b)
 431       return false;
 432   }
 433   return *b == 0;
 434 }
 435
 436 // Front-ends for LowerCaseEqualsASCII.
 437 bool LowerCaseEqualsASCII(const std::string& a, const char* b) {
 438   return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
 439 }
 440
 441 bool LowerCaseEqualsASCII(const string16& a, const char* b) {
 442   return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
 443 }
 444
 445 bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
 446                           std::string::const_iterator a_end,
 447                           const char* b) {
 448   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 449 }
 450
 451 bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
 452                           string16::const_iterator a_end,
 453                           const char* b) {
 454   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 455 }
 456
 457 bool LowerCaseEqualsASCII(const char* a_begin,
 458                           const char* a_end,
 459                           const char* b) {
 460   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 461 }
 462
 463 bool LowerCaseEqualsASCII(const char16* a_begin,
 464                           const char16* a_end,
 465                           const char* b) {
 466   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 467 }
 468
 469 bool EqualsASCII(const string16& a, const base::StringPiece& b) {
 470   if (a.length() != b.length())
 471     return false;
 472   return std::equal(b.begin(), b.end(), a.begin());
 473 }
 474
 475 bool StartsWithASCII(const std::string& str,
 476                      const std::string& search,
 477                      bool case_sensitive) {
 478   if (case_sensitive)
 479     return str.compare(0, search.length(), search) == 0;
 480   else
 481     return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0;
 482 }
 483
 484 template <typename STR>
 485 bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) {
 486   if (case_sensitive) {
 487     return str.compare(0, search.length(), search) == 0;
 488   } else {
 489     if (search.size() > str.size())
 490       return false;
 491     return std::equal(search.begin(), search.end(), str.begin(),
 492                       base::CaseInsensitiveCompare<typename STR::value_type>());
 493   }
 494 }
 495
 496 bool StartsWith(const string16& str, const string16& search,
 497                 bool case_sensitive) {
 498   return StartsWithT(str, search, case_sensitive);
 499 }
 500
 501 template <typename STR>
 502 bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) {
 503   size_t str_length = str.length();
 504   size_t search_length = search.length();
 505   if (search_length > str_length)
 506     return false;
 507   if (case_sensitive)
 508     return str.compare(str_length - search_length, search_length, search) == 0;
 509   return std::equal(search.begin(), search.end(),
 510                     str.begin() + (str_length - search_length),
 511                     base::CaseInsensitiveCompare<typename STR::value_type>());
 512 }
 513
 514 bool EndsWith(const std::string& str, const std::string& search,
 515               bool case_sensitive) {
 516   return EndsWithT(str, search, case_sensitive);
 517 }
 518
 519 bool EndsWith(const string16& str, const string16& search,
 520               bool case_sensitive) {
 521   return EndsWithT(str, search, case_sensitive);
 522 }
 523
 524 static const char* const kByteStringsUnlocalized[] = {
 525   " B",
 526   " kB",
 527   " MB",
 528   " GB",
 529   " TB",
 530   " PB"
 531 };
 532
 533 string16 FormatBytesUnlocalized(int64 bytes) {
 534   double unit_amount = static_cast<double>(bytes);
 535   size_t dimension = 0;
 536   const int kKilo = 1024;
 537   while (unit_amount >= kKilo &&
 538          dimension < arraysize(kByteStringsUnlocalized) - 1) {
 539     unit_amount /= kKilo;
 540     dimension++;
 541   }
 542
 543   char buf[64];
 544   if (bytes != 0 && dimension > 0 && unit_amount < 100) {
 545     base::snprintf(buf, arraysize(buf), "%.1lf%s", unit_amount,
 546                    kByteStringsUnlocalized[dimension]);
 547   } else {
 548     base::snprintf(buf, arraysize(buf), "%.0lf%s", unit_amount,
 549                    kByteStringsUnlocalized[dimension]);
 550   }
 551
 552   return base::ASCIIToUTF16(buf);
 553 }
 554
 555 // Runs in O(n) time in the length of |str|.
 556 template<class StringType>
 557 void DoReplaceSubstringsAfterOffset(StringType* str,
 558                                     size_t offset,
 559                                     const StringType& find_this,
 560                                     const StringType& replace_with,
 561                                     bool replace_all) {
 562   DCHECK(!find_this.empty());
 563
 564   // If the find string doesn't appear, there's nothing to do.
 565   offset = str->find(find_this, offset);
 566   if (offset == StringType::npos)
 567     return;
 568
 569   // If we're only replacing one instance, there's no need to do anything
 570   // complicated.
 571   size_t find_length = find_this.length();
 572   if (!replace_all) {
 573     str->replace(offset, find_length, replace_with);
 574     return;
 575   }
 576
 577   // If the find and replace strings are the same length, we can simply use
 578   // replace() on each instance, and finish the entire operation in O(n) time.
 579   size_t replace_length = replace_with.length();
 580   if (find_length == replace_length) {
 581     do {
 582       str->replace(offset, find_length, replace_with);
 583       offset = str->find(find_this, offset + replace_length);
 584     } while (offset != StringType::npos);
 585     return;
 586   }
 587
 588   // Since the find and replace strings aren't the same length, a loop like the
 589   // one above would be O(n^2) in the worst case, as replace() will shift the
 590   // entire remaining string each time.  We need to be more clever to keep
 591   // things O(n).
 592   //
 593   // If we're shortening the string, we can alternate replacements with shifting
 594   // forward the intervening characters using memmove().
 595   size_t str_length = str->length();
 596   if (find_length > replace_length) {
 597     size_t write_offset = offset;
 598     do {
 599       if (replace_length) {
 600         str->replace(write_offset, replace_length, replace_with);
 601         write_offset += replace_length;
 602       }
 603       size_t read_offset = offset + find_length;
 604       offset = std::min(str->find(find_this, read_offset), str_length);
 605       size_t length = offset - read_offset;
 606       if (length) {
 607         memmove(&(*str)[write_offset], &(*str)[read_offset],
 608                 length * sizeof(typename StringType::value_type));
 609         write_offset += length;
 610       }
 611     } while (offset < str_length);
 612     str->resize(write_offset);
 613     return;
 614   }
 615
 616   // We're lengthening the string.  We can use alternating replacements and
 617   // memmove() calls like above, but we need to precalculate the final string
 618   // length and then expand from back-to-front to avoid overwriting the string
 619   // as we're reading it, needing to shift, or having to copy to a second string
 620   // temporarily.
 621   size_t first_match = offset;
 622
 623   // First, calculate the final length and resize the string.
 624   size_t final_length = str_length;
 625   size_t expansion = replace_length - find_length;
 626   size_t current_match;
 627   do {
 628     final_length += expansion;
 629     // Minor optimization: save this offset into |current_match|, so that on
 630     // exit from the loop, |current_match| will point at the last instance of
 631     // the find string, and we won't need to find() it again immediately.
 632     current_match = offset;
 633     offset = str->find(find_this, offset + find_length);
 634   } while (offset != StringType::npos);
 635   str->resize(final_length);
 636
 637   // Now do the replacement loop, working backwards through the string.
 638   for (size_t prev_match = str_length, write_offset = final_length; ;
 639        current_match = str->rfind(find_this, current_match - 1)) {
 640     size_t read_offset = current_match + find_length;
 641     size_t length = prev_match - read_offset;
 642     if (length) {
 643       write_offset -= length;
 644       memmove(&(*str)[write_offset], &(*str)[read_offset],
 645               length * sizeof(typename StringType::value_type));
 646     }
 647     write_offset -= replace_length;
 648     str->replace(write_offset, replace_length, replace_with);
 649     if (current_match == first_match)
 650       return;
 651     prev_match = current_match;
 652   }
 653 }
 654
 655 void ReplaceFirstSubstringAfterOffset(string16* str,
 656                                       size_t start_offset,
 657                                       const string16& find_this,
 658                                       const string16& replace_with) {
 659   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 660                                  false);  // replace first instance
 661 }
 662
 663 void ReplaceFirstSubstringAfterOffset(std::string* str,
 664                                       size_t start_offset,
 665                                       const std::string& find_this,
 666                                       const std::string& replace_with) {
 667   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 668                                  false);  // replace first instance
 669 }
 670
 671 void ReplaceSubstringsAfterOffset(string16* str,
 672                                   size_t start_offset,
 673                                   const string16& find_this,
 674                                   const string16& replace_with) {
 675   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 676                                  true);  // replace all instances
 677 }
 678
 679 void ReplaceSubstringsAfterOffset(std::string* str,
 680                                   size_t start_offset,
 681                                   const std::string& find_this,
 682                                   const std::string& replace_with) {
 683   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 684                                  true);  // replace all instances
 685 }
 686
 687
 688 template<typename STR>
 689 static size_t TokenizeT(const STR& str,
 690                         const STR& delimiters,
 691                         std::vector<STR>* tokens) {
 692   tokens->clear();
 693
 694   size_t start = str.find_first_not_of(delimiters);
 695   while (start != STR::npos) {
 696     size_t end = str.find_first_of(delimiters, start + 1);
 697     if (end == STR::npos) {
 698       tokens->push_back(str.substr(start));
 699       break;
 700     } else {
 701       tokens->push_back(str.substr(start, end - start));
 702       start = str.find_first_not_of(delimiters, end + 1);
 703     }
 704   }
 705
 706   return tokens->size();
 707 }
 708
 709 size_t Tokenize(const string16& str,
 710                 const string16& delimiters,
 711                 std::vector<string16>* tokens) {
 712   return TokenizeT(str, delimiters, tokens);
 713 }
 714
 715 size_t Tokenize(const std::string& str,
 716                 const std::string& delimiters,
 717                 std::vector<std::string>* tokens) {
 718   return TokenizeT(str, delimiters, tokens);
 719 }
 720
 721 size_t Tokenize(const base::StringPiece& str,
 722                 const base::StringPiece& delimiters,
 723                 std::vector<base::StringPiece>* tokens) {
 724   return TokenizeT(str, delimiters, tokens);
 725 }
 726
 727 template<typename STR>
 728 static STR JoinStringT(const std::vector<STR>& parts, const STR& sep) {
 729   if (parts.empty())
 730     return STR();
 731
 732   STR result(parts[0]);
 733   typename std::vector<STR>::const_iterator iter = parts.begin();
 734   ++iter;
 735
 736   for (; iter != parts.end(); ++iter) {
 737     result += sep;
 738     result += *iter;
 739   }
 740
 741   return result;
 742 }
 743
 744 std::string JoinString(const std::vector<std::string>& parts, char sep) {
 745   return JoinStringT(parts, std::string(1, sep));
 746 }
 747
 748 string16 JoinString(const std::vector<string16>& parts, char16 sep) {
 749   return JoinStringT(parts, string16(1, sep));
 750 }
 751
 752 std::string JoinString(const std::vector<std::string>& parts,
 753                        const std::string& separator) {
 754   return JoinStringT(parts, separator);
 755 }
 756
 757 string16 JoinString(const std::vector<string16>& parts,
 758                     const string16& separator) {
 759   return JoinStringT(parts, separator);
 760 }
 761
 762 template<class FormatStringType, class OutStringType>
 763 OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string,
 764     const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) {
 765   size_t substitutions = subst.size();
 766
 767   size_t sub_length = 0;
 768   for (typename std::vector<OutStringType>::const_iterator iter = subst.begin();
 769        iter != subst.end(); ++iter) {
 770     sub_length += iter->length();
 771   }
 772
 773   OutStringType formatted;
 774   formatted.reserve(format_string.length() + sub_length);
 775
 776   std::vector<ReplacementOffset> r_offsets;
 777   for (typename FormatStringType::const_iterator i = format_string.begin();
 778        i != format_string.end(); ++i) {
 779     if ('$' == *i) {
 780       if (i + 1 != format_string.end()) {
 781         ++i;
 782         DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i;
 783         if ('$' == *i) {
 784           while (i != format_string.end() && '$' == *i) {
 785             formatted.push_back('$');
 786             ++i;
 787           }
 788           --i;
 789         } else {
 790           uintptr_t index = 0;
 791           while (i != format_string.end() && '0' <= *i && *i <= '9') {
 792             index *= 10;
 793             index += *i - '0';
 794             ++i;
 795           }
 796           --i;
 797           index -= 1;
 798           if (offsets) {
 799             ReplacementOffset r_offset(index,
 800                 static_cast<int>(formatted.size()));
 801             r_offsets.insert(std::lower_bound(r_offsets.begin(),
 802                                               r_offsets.end(),
 803                                               r_offset,
 804                                               &CompareParameter),
 805                              r_offset);
 806           }
 807           if (index < substitutions)
 808             formatted.append(subst.at(index));
 809         }
 810       }
 811     } else {
 812       formatted.push_back(*i);
 813     }
 814   }
 815   if (offsets) {
 816     for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin();
 817          i != r_offsets.end(); ++i) {
 818       offsets->push_back(i->offset);
 819     }
 820   }
 821   return formatted;
 822 }
 823
 824 string16 ReplaceStringPlaceholders(const string16& format_string,
 825                                    const std::vector<string16>& subst,
 826                                    std::vector<size_t>* offsets) {
 827   return DoReplaceStringPlaceholders(format_string, subst, offsets);
 828 }
 829
 830 std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
 831                                       const std::vector<std::string>& subst,
 832                                       std::vector<size_t>* offsets) {
 833   return DoReplaceStringPlaceholders(format_string, subst, offsets);
 834 }
 835
 836 string16 ReplaceStringPlaceholders(const string16& format_string,
 837                                    const string16& a,
 838                                    size_t* offset) {
 839   std::vector<size_t> offsets;
 840   std::vector<string16> subst;
 841   subst.push_back(a);
 842   string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets);
 843
 844   DCHECK_EQ(1U, offsets.size());
 845   if (offset)
 846     *offset = offsets[0];
 847   return result;
 848 }
 849
 850 static bool IsWildcard(base_icu::UChar32 character) {
 851   return character == '*' || character == '?';
 852 }
 853
 854 // Move the strings pointers to the point where they start to differ.
 855 template <typename CHAR, typename NEXT>
 856 static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
 857                          const CHAR** string, const CHAR* string_end,
 858                          NEXT next) {
 859   const CHAR* escape = NULL;
 860   while (*pattern != pattern_end && *string != string_end) {
 861     if (!escape && IsWildcard(**pattern)) {
 862       // We don't want to match wildcard here, except if it's escaped.
 863       return;
 864     }
 865
 866     // Check if the escapement char is found. If so, skip it and move to the
 867     // next character.
 868     if (!escape && **pattern == '\\') {
 869       escape = *pattern;
 870       next(pattern, pattern_end);
 871       continue;
 872     }
 873
 874     // Check if the chars match, if so, increment the ptrs.
 875     const CHAR* pattern_next = *pattern;
 876     const CHAR* string_next = *string;
 877     base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
 878     if (pattern_char == next(&string_next, string_end) &&
 879         pattern_char != CBU_SENTINEL) {
 880       *pattern = pattern_next;
 881       *string = string_next;
 882     } else {
 883       // Uh oh, it did not match, we are done. If the last char was an
 884       // escapement, that means that it was an error to advance the ptr here,
 885       // let's put it back where it was. This also mean that the MatchPattern
 886       // function will return false because if we can't match an escape char
 887       // here, then no one will.
 888       if (escape) {
 889         *pattern = escape;
 890       }
 891       return;
 892     }
 893
 894     escape = NULL;
 895   }
 896 }
 897
 898 template <typename CHAR, typename NEXT>
 899 static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
 900   while (*pattern != end) {
 901     if (!IsWildcard(**pattern))
 902       return;
 903     next(pattern, end);
 904   }
 905 }
 906
 907 template <typename CHAR, typename NEXT>
 908 static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
 909                           const CHAR* pattern, const CHAR* pattern_end,
 910                           int depth,
 911                           NEXT next) {
 912   const int kMaxDepth = 16;
 913   if (depth > kMaxDepth)
 914     return false;
 915
 916   // Eat all the matching chars.
 917   EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
 918
 919   // If the string is empty, then the pattern must be empty too, or contains
 920   // only wildcards.
 921   if (eval == eval_end) {
 922     EatWildcard(&pattern, pattern_end, next);
 923     return pattern == pattern_end;
 924   }
 925
 926   // Pattern is empty but not string, this is not a match.
 927   if (pattern == pattern_end)
 928     return false;
 929
 930   // If this is a question mark, then we need to compare the rest with
 931   // the current string or the string with one character eaten.
 932   const CHAR* next_pattern = pattern;
 933   next(&next_pattern, pattern_end);
 934   if (pattern[0] == '?') {
 935     if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
 936                       depth + 1, next))
 937       return true;
 938     const CHAR* next_eval = eval;
 939     next(&next_eval, eval_end);
 940     if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
 941                       depth + 1, next))
 942       return true;
 943   }
 944
 945   // This is a *, try to match all the possible substrings with the remainder
 946   // of the pattern.
 947   if (pattern[0] == '*') {
 948     // Collapse duplicate wild cards (********** into *) so that the
 949     // method does not recurse unnecessarily. http://crbug.com/52839
 950     EatWildcard(&next_pattern, pattern_end, next);
 951
 952     while (eval != eval_end) {
 953       if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
 954                         depth + 1, next))
 955         return true;
 956       eval++;
 957     }
 958
 959     // We reached the end of the string, let see if the pattern contains only
 960     // wildcards.
 961     if (eval == eval_end) {
 962       EatWildcard(&pattern, pattern_end, next);
 963       if (pattern != pattern_end)
 964         return false;
 965       return true;
 966     }
 967   }
 968
 969   return false;
 970 }
 971
 972 struct NextCharUTF8 {
 973   base_icu::UChar32 operator()(const char** p, const char* end) {
 974     base_icu::UChar32 c;
 975     int offset = 0;
 976     CBU8_NEXT(*p, offset, end - *p, c);
 977     *p += offset;
 978     return c;
 979   }
 980 };
 981
 982 struct NextCharUTF16 {
 983   base_icu::UChar32 operator()(const char16** p, const char16* end) {
 984     base_icu::UChar32 c;
 985     int offset = 0;
 986     CBU16_NEXT(*p, offset, end - *p, c);
 987     *p += offset;
 988     return c;
 989   }
 990 };
 991
 992 bool MatchPattern(const base::StringPiece& eval,
 993                   const base::StringPiece& pattern) {
 994   return MatchPatternT(eval.data(), eval.data() + eval.size(),
 995                        pattern.data(), pattern.data() + pattern.size(),
 996                        0, NextCharUTF8());
 997 }
 998
 999 bool MatchPattern(const string16& eval, const string16& pattern) {
1000   return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
1001                        pattern.c_str(), pattern.c_str() + pattern.size(),
1002                        0, NextCharUTF16());
1003 }
1004
1005 // The following code is compatible with the OpenBSD lcpy interface.  See:
1006 //   http://www.gratisoft.us/todd/papers/strlcpy.html
1007 //   ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1008
namespace {

// Copies |src| into |dst|, writing at most |dst_size| characters including the
// terminator, and returns the length of |src| (excluding its terminator). A
// return value >= |dst_size| therefore means the copy was truncated. When
// |dst_size| is 0 nothing is written to |dst|.
template <typename CHAR>
size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
  size_t copied = 0;
  while (copied < dst_size) {
    const CHAR current = src[copied];
    dst[copied] = current;
    if (current == 0)  // The terminator was reached and has been copied.
      return copied;
    ++copied;
  }

  // |dst| is full and |src| has not ended: the last slot holds a source
  // character, so overwrite it with the terminator.
  if (dst_size > 0)
    dst[dst_size - 1] = 0;

  // Walk the remainder of |src| to report its full length in characters.
  size_t src_length = dst_size;
  while (src[src_length] != 0)
    ++src_length;
  return src_length;
}

}  // namespace
1028
1029 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {
1030   return lcpyT<char>(dst, src, dst_size);
1031 }
1032 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
1033   return lcpyT<wchar_t>(dst, src, dst_size);
1034 }