Disable crashing tests, my previous checkin to mark them flaky did not help.
[chromium-blink-merge.git] / base / string_util.cc
blob1ad9abe5d8396f20dcefd5170aedf4e5644391ef
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/string_util.h"
7 #include "build/build_config.h"
9 #include <ctype.h>
10 #include <errno.h>
11 #include <math.h>
12 #include <stdarg.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <time.h>
17 #include <wchar.h>
18 #include <wctype.h>
20 #include <algorithm>
21 #include <vector>
23 #include "base/basictypes.h"
24 #include "base/logging.h"
25 #include "base/memory/singleton.h"
26 #include "base/third_party/dmg_fp/dmg_fp.h"
27 #include "base/utf_string_conversion_utils.h"
28 #include "base/utf_string_conversions.h"
29 #include "base/third_party/icu/icu_utf.h"
31 namespace {
33 // Force the singleton used by Empty[W]String[16] to be a unique type. This
34 // prevents other code that might accidentally use Singleton<string> from
35 // getting our internal one.
36 struct EmptyStrings {
37 EmptyStrings() {}
38 const std::string s;
39 const std::wstring ws;
40 const string16 s16;
42 static EmptyStrings* GetInstance() {
43 return Singleton<EmptyStrings>::get();
// Used by ReplaceStringPlaceholders to track the position in the string of
// replaced parameters.
struct ReplacementOffset {
  ReplacementOffset(uintptr_t parameter, size_t offset)
      : parameter(parameter),
        offset(offset) {}

  // Index of the parameter.
  uintptr_t parameter;

  // Starting position in the string.
  size_t offset;
};
61 static bool CompareParameter(const ReplacementOffset& elem1,
62 const ReplacementOffset& elem2) {
63 return elem1.parameter < elem2.parameter;
66 } // namespace
68 namespace base {
// Scans a wprintf-style |format| string and returns false as soon as it finds
// a conversion specification that this check treats as non-portable:
//   - %s or %c without an 'l' modifier, and
//   - %S, %C, %F, %D, %O and %U.
// Returns true otherwise, including when the string ends in the middle of a
// specification.
bool IsWprintfFormatPortable(const wchar_t* format) {
  for (const wchar_t* position = format; *position != '\0'; ++position) {
    if (*position == '%') {
      bool in_specification = true;
      bool modifier_l = false;
      while (in_specification) {
        // Eat up characters until reaching a known specifier.
        if (*++position == '\0') {
          // The format string ended in the middle of a specification.  Call
          // it portable because no unportable specifications were found.  The
          // string is equally broken on all platforms.
          return true;
        }

        if (*position == 'l') {
          // 'l' is the only thing that can save the 's' and 'c' specifiers.
          modifier_l = true;
        } else if (((*position == 's' || *position == 'c') && !modifier_l) ||
                   *position == 'S' || *position == 'C' || *position == 'F' ||
                   *position == 'D' || *position == 'O' || *position == 'U') {
          // Not portable.
          return false;
        }

        if (wcschr(L"diouxXeEfgGaAcspn%", *position)) {
          // Portable, keep scanning the rest of the format string.
          in_specification = false;
        }
      }
    }
  }

  return true;
}
105 } // namespace base
108 const std::string& EmptyString() {
109 return EmptyStrings::GetInstance()->s;
112 const std::wstring& EmptyWString() {
113 return EmptyStrings::GetInstance()->ws;
116 const string16& EmptyString16() {
117 return EmptyStrings::GetInstance()->s16;
120 #define WHITESPACE_UNICODE \
121 0x0009, /* <control-0009> to <control-000D> */ \
122 0x000A, \
123 0x000B, \
124 0x000C, \
125 0x000D, \
126 0x0020, /* Space */ \
127 0x0085, /* <control-0085> */ \
128 0x00A0, /* No-Break Space */ \
129 0x1680, /* Ogham Space Mark */ \
130 0x180E, /* Mongolian Vowel Separator */ \
131 0x2000, /* En Quad to Hair Space */ \
132 0x2001, \
133 0x2002, \
134 0x2003, \
135 0x2004, \
136 0x2005, \
137 0x2006, \
138 0x2007, \
139 0x2008, \
140 0x2009, \
141 0x200A, \
142 0x200C, /* Zero Width Non-Joiner */ \
143 0x2028, /* Line Separator */ \
144 0x2029, /* Paragraph Separator */ \
145 0x202F, /* Narrow No-Break Space */ \
146 0x205F, /* Medium Mathematical Space */ \
147 0x3000, /* Ideographic Space */ \
150 const wchar_t kWhitespaceWide[] = {
151 WHITESPACE_UNICODE
153 const char16 kWhitespaceUTF16[] = {
154 WHITESPACE_UNICODE
// NUL-terminated table of the ASCII whitespace characters.
const char kWhitespaceASCII[] = {
  0x09,    // <control-0009> to <control-000D>
  0x0A,
  0x0B,
  0x0C,
  0x0D,
  0x20,    // Space
  0        // Terminator, so the array can be used as a C string.
};
// The three bytes EF BB BF, i.e. the UTF-8 encoding of the byte order mark.
const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF";
// Copies |input| into |output| and then deletes every character that appears
// in the NUL-terminated set |remove_chars|. Returns true if at least one
// character was removed.
template<typename STR>
bool RemoveCharsT(const STR& input,
                  const typename STR::value_type remove_chars[],
                  STR* output) {
  bool removed = false;
  size_t found;

  *output = input;

  found = output->find_first_of(remove_chars);
  while (found != STR::npos) {
    removed = true;
    output->replace(found, 1, STR());
    // Resume at |found|: the character that slid into this slot has not been
    // examined yet.
    found = output->find_first_of(remove_chars, found);
  }

  return removed;
}
187 bool RemoveChars(const std::wstring& input,
188 const wchar_t remove_chars[],
189 std::wstring* output) {
190 return RemoveCharsT(input, remove_chars, output);
193 #if !defined(WCHAR_T_IS_UTF16)
194 bool RemoveChars(const string16& input,
195 const char16 remove_chars[],
196 string16* output) {
197 return RemoveCharsT(input, remove_chars, output);
199 #endif
201 bool RemoveChars(const std::string& input,
202 const char remove_chars[],
203 std::string* output) {
204 return RemoveCharsT(input, remove_chars, output);
207 template<typename STR>
208 TrimPositions TrimStringT(const STR& input,
209 const typename STR::value_type trim_chars[],
210 TrimPositions positions,
211 STR* output) {
212 // Find the edges of leading/trailing whitespace as desired.
213 const typename STR::size_type last_char = input.length() - 1;
214 const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ?
215 input.find_first_not_of(trim_chars) : 0;
216 const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ?
217 input.find_last_not_of(trim_chars) : last_char;
219 // When the string was all whitespace, report that we stripped off whitespace
220 // from whichever position the caller was interested in. For empty input, we
221 // stripped no whitespace, but we still need to clear |output|.
222 if (input.empty() ||
223 (first_good_char == STR::npos) || (last_good_char == STR::npos)) {
224 bool input_was_empty = input.empty(); // in case output == &input
225 output->clear();
226 return input_was_empty ? TRIM_NONE : positions;
229 // Trim the whitespace.
230 *output =
231 input.substr(first_good_char, last_good_char - first_good_char + 1);
233 // Return where we trimmed from.
234 return static_cast<TrimPositions>(
235 ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
236 ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
239 bool TrimString(const std::wstring& input,
240 const wchar_t trim_chars[],
241 std::wstring* output) {
242 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
245 #if !defined(WCHAR_T_IS_UTF16)
246 bool TrimString(const string16& input,
247 const char16 trim_chars[],
248 string16* output) {
249 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
251 #endif
253 bool TrimString(const std::string& input,
254 const char trim_chars[],
255 std::string* output) {
256 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
259 void TruncateUTF8ToByteSize(const std::string& input,
260 const size_t byte_size,
261 std::string* output) {
262 DCHECK(output);
263 if (byte_size > input.length()) {
264 *output = input;
265 return;
267 DCHECK_LE(byte_size, static_cast<uint32>(kint32max));
268 // Note: This cast is necessary because CBU8_NEXT uses int32s.
269 int32 truncation_length = static_cast<int32>(byte_size);
270 int32 char_index = truncation_length - 1;
271 const char* data = input.data();
273 // Using CBU8, we will move backwards from the truncation point
274 // to the beginning of the string looking for a valid UTF8
275 // character. Once a full UTF8 character is found, we will
276 // truncate the string to the end of that character.
277 while (char_index >= 0) {
278 int32 prev = char_index;
279 uint32 code_point = 0;
280 CBU8_NEXT(data, char_index, truncation_length, code_point);
281 if (!base::IsValidCharacter(code_point) ||
282 !base::IsValidCodepoint(code_point)) {
283 char_index = prev - 1;
284 } else {
285 break;
289 if (char_index >= 0 )
290 *output = input.substr(0, char_index);
291 else
292 output->clear();
295 TrimPositions TrimWhitespace(const std::wstring& input,
296 TrimPositions positions,
297 std::wstring* output) {
298 return TrimStringT(input, kWhitespaceWide, positions, output);
301 #if !defined(WCHAR_T_IS_UTF16)
302 TrimPositions TrimWhitespace(const string16& input,
303 TrimPositions positions,
304 string16* output) {
305 return TrimStringT(input, kWhitespaceUTF16, positions, output);
307 #endif
309 TrimPositions TrimWhitespaceASCII(const std::string& input,
310 TrimPositions positions,
311 std::string* output) {
312 return TrimStringT(input, kWhitespaceASCII, positions, output);
315 // This function is only for backward-compatibility.
316 // To be removed when all callers are updated.
317 TrimPositions TrimWhitespace(const std::string& input,
318 TrimPositions positions,
319 std::string* output) {
320 return TrimWhitespaceASCII(input, positions, output);
// Returns a copy of |text| in which leading and trailing whitespace is dropped
// and every interior whitespace run is reduced to a single space. When
// |trim_sequences_with_line_breaks| is true, an interior run that contains a
// CR or LF is removed entirely instead.
template<typename STR>
STR CollapseWhitespaceT(const STR& text,
                        bool trim_sequences_with_line_breaks) {
  STR result;
  // The output is never longer than the input; it is shrunk to the real
  // length at the end.
  result.resize(text.size());

  // Set flags to pretend we're already in a trimmed whitespace sequence, so we
  // will trim any leading whitespace.
  bool in_whitespace = true;
  bool already_trimmed = true;

  int chars_written = 0;
  for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
    if (IsWhitespace(*i)) {
      if (!in_whitespace) {
        // Reduce all whitespace sequences to a single space.
        in_whitespace = true;
        result[chars_written++] = L' ';
      }
      if (trim_sequences_with_line_breaks && !already_trimmed &&
          ((*i == '\n') || (*i == '\r'))) {
        // Whitespace sequences containing CR or LF are eliminated entirely.
        already_trimmed = true;
        --chars_written;
      }
    } else {
      // Non-whitespace characters are copied straight across.
      in_whitespace = false;
      already_trimmed = false;
      result[chars_written++] = *i;
    }
  }

  if (in_whitespace && !already_trimmed) {
    // Any trailing whitespace is eliminated.
    --chars_written;
  }

  result.resize(chars_written);
  return result;
}
365 std::wstring CollapseWhitespace(const std::wstring& text,
366 bool trim_sequences_with_line_breaks) {
367 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
370 #if !defined(WCHAR_T_IS_UTF16)
371 string16 CollapseWhitespace(const string16& text,
372 bool trim_sequences_with_line_breaks) {
373 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
375 #endif
377 std::string CollapseWhitespaceASCII(const std::string& text,
378 bool trim_sequences_with_line_breaks) {
379 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
382 bool ContainsOnlyWhitespaceASCII(const std::string& str) {
383 for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) {
384 if (!IsAsciiWhitespace(*i))
385 return false;
387 return true;
390 bool ContainsOnlyWhitespace(const string16& str) {
391 for (string16::const_iterator i(str.begin()); i != str.end(); ++i) {
392 if (!IsWhitespace(*i))
393 return false;
395 return true;
// Returns true if every character of |input| also occurs in |characters|. An
// empty |input| yields true.
template<typename STR>
static bool ContainsOnlyCharsT(const STR& input, const STR& characters) {
  for (typename STR::const_iterator iter = input.begin();
       iter != input.end(); ++iter) {
    if (characters.find(*iter) == STR::npos)
      return false;
  }
  return true;
}
408 bool ContainsOnlyChars(const std::wstring& input,
409 const std::wstring& characters) {
410 return ContainsOnlyCharsT(input, characters);
413 #if !defined(WCHAR_T_IS_UTF16)
414 bool ContainsOnlyChars(const string16& input, const string16& characters) {
415 return ContainsOnlyCharsT(input, characters);
417 #endif
419 bool ContainsOnlyChars(const std::string& input,
420 const std::string& characters) {
421 return ContainsOnlyCharsT(input, characters);
424 std::string WideToASCII(const std::wstring& wide) {
425 DCHECK(IsStringASCII(wide)) << wide;
426 return std::string(wide.begin(), wide.end());
429 std::string UTF16ToASCII(const string16& utf16) {
430 DCHECK(IsStringASCII(utf16)) << utf16;
431 return std::string(utf16.begin(), utf16.end());
// Latin1 is just the low range of Unicode, so we can copy directly to convert.
// Returns true and stores the converted string in |latin1| when every element
// of |wide| is in the range 0..255. Otherwise returns false and leaves
// |latin1| empty.
bool WideToLatin1(const std::wstring& wide, std::string* latin1) {
  std::string output;
  output.resize(wide.size());
  latin1->clear();
  for (size_t i = 0; i < wide.size(); i++) {
    // Compare as unsigned: where wchar_t is a signed type a negative value
    // would otherwise pass the "> 255" test and be narrowed silently.
    if (static_cast<unsigned long>(wide[i]) > 255)
      return false;
    output[i] = static_cast<char>(wide[i]);
  }
  latin1->swap(output);
  return true;
}
448 template<class STR>
449 static bool DoIsStringASCII(const STR& str) {
450 for (size_t i = 0; i < str.length(); i++) {
451 typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i];
452 if (c > 0x7F)
453 return false;
455 return true;
458 bool IsStringASCII(const std::wstring& str) {
459 return DoIsStringASCII(str);
462 #if !defined(WCHAR_T_IS_UTF16)
463 bool IsStringASCII(const string16& str) {
464 return DoIsStringASCII(str);
466 #endif
468 bool IsStringASCII(const base::StringPiece& str) {
469 return DoIsStringASCII(str);
472 bool IsStringUTF8(const std::string& str) {
473 const char *src = str.data();
474 int32 src_len = static_cast<int32>(str.length());
475 int32 char_index = 0;
477 while (char_index < src_len) {
478 int32 code_point;
479 CBU8_NEXT(src, char_index, src_len, code_point);
480 if (!base::IsValidCharacter(code_point))
481 return false;
483 return true;
486 template<typename Iter>
487 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
488 Iter a_end,
489 const char* b) {
490 for (Iter it = a_begin; it != a_end; ++it, ++b) {
491 if (!*b || base::ToLowerASCII(*it) != *b)
492 return false;
494 return *b == 0;
497 // Front-ends for LowerCaseEqualsASCII.
498 bool LowerCaseEqualsASCII(const std::string& a, const char* b) {
499 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
502 bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) {
503 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
506 #if !defined(WCHAR_T_IS_UTF16)
507 bool LowerCaseEqualsASCII(const string16& a, const char* b) {
508 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
510 #endif
512 bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
513 std::string::const_iterator a_end,
514 const char* b) {
515 return DoLowerCaseEqualsASCII(a_begin, a_end, b);
518 bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
519 std::wstring::const_iterator a_end,
520 const char* b) {
521 return DoLowerCaseEqualsASCII(a_begin, a_end, b);
524 #if !defined(WCHAR_T_IS_UTF16)
525 bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
526 string16::const_iterator a_end,
527 const char* b) {
528 return DoLowerCaseEqualsASCII(a_begin, a_end, b);
530 #endif
532 bool LowerCaseEqualsASCII(const char* a_begin,
533 const char* a_end,
534 const char* b) {
535 return DoLowerCaseEqualsASCII(a_begin, a_end, b);
538 bool LowerCaseEqualsASCII(const wchar_t* a_begin,
539 const wchar_t* a_end,
540 const char* b) {
541 return DoLowerCaseEqualsASCII(a_begin, a_end, b);
544 #if !defined(WCHAR_T_IS_UTF16)
545 bool LowerCaseEqualsASCII(const char16* a_begin,
546 const char16* a_end,
547 const char* b) {
548 return DoLowerCaseEqualsASCII(a_begin, a_end, b);
550 #endif
552 bool EqualsASCII(const string16& a, const base::StringPiece& b) {
553 if (a.length() != b.length())
554 return false;
555 return std::equal(b.begin(), b.end(), a.begin());
558 bool StartsWithASCII(const std::string& str,
559 const std::string& search,
560 bool case_sensitive) {
561 if (case_sensitive)
562 return str.compare(0, search.length(), search) == 0;
563 else
564 return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0;
567 template <typename STR>
568 bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) {
569 if (case_sensitive) {
570 return str.compare(0, search.length(), search) == 0;
571 } else {
572 if (search.size() > str.size())
573 return false;
574 return std::equal(search.begin(), search.end(), str.begin(),
575 base::CaseInsensitiveCompare<typename STR::value_type>());
579 bool StartsWith(const std::wstring& str, const std::wstring& search,
580 bool case_sensitive) {
581 return StartsWithT(str, search, case_sensitive);
584 #if !defined(WCHAR_T_IS_UTF16)
585 bool StartsWith(const string16& str, const string16& search,
586 bool case_sensitive) {
587 return StartsWithT(str, search, case_sensitive);
589 #endif
591 template <typename STR>
592 bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) {
593 typename STR::size_type str_length = str.length();
594 typename STR::size_type search_length = search.length();
595 if (search_length > str_length)
596 return false;
597 if (case_sensitive) {
598 return str.compare(str_length - search_length, search_length, search) == 0;
599 } else {
600 return std::equal(search.begin(), search.end(),
601 str.begin() + (str_length - search_length),
602 base::CaseInsensitiveCompare<typename STR::value_type>());
606 bool EndsWith(const std::string& str, const std::string& search,
607 bool case_sensitive) {
608 return EndsWithT(str, search, case_sensitive);
611 bool EndsWith(const std::wstring& str, const std::wstring& search,
612 bool case_sensitive) {
613 return EndsWithT(str, search, case_sensitive);
616 #if !defined(WCHAR_T_IS_UTF16)
617 bool EndsWith(const string16& str, const string16& search,
618 bool case_sensitive) {
619 return EndsWithT(str, search, case_sensitive);
621 #endif
623 DataUnits GetByteDisplayUnits(int64 bytes) {
624 // The byte thresholds at which we display amounts. A byte count is displayed
625 // in unit U when kUnitThresholds[U] <= bytes < kUnitThresholds[U+1].
626 // This must match the DataUnits enum.
627 static const int64 kUnitThresholds[] = {
628 0, // DATA_UNITS_BYTE,
629 3*1024, // DATA_UNITS_KIBIBYTE,
630 2*1024*1024, // DATA_UNITS_MEBIBYTE,
631 1024*1024*1024 // DATA_UNITS_GIBIBYTE,
634 if (bytes < 0) {
635 NOTREACHED() << "Negative bytes value";
636 return DATA_UNITS_BYTE;
639 int unit_index = arraysize(kUnitThresholds);
640 while (--unit_index > 0) {
641 if (bytes >= kUnitThresholds[unit_index])
642 break;
645 DCHECK(unit_index >= DATA_UNITS_BYTE && unit_index <= DATA_UNITS_GIBIBYTE);
646 return DataUnits(unit_index);
// TODO(mpcomplete): deal with locale
// Byte suffixes.  This must match the DataUnits enum.
static const char* const kByteStrings[] = {
  "B",
  "kB",
  "MB",
  "GB"
};

// Transfer-rate suffixes, in the same order as kByteStrings.
static const char* const kSpeedStrings[] = {
  "B/s",
  "kB/s",
  "MB/s",
  "GB/s"
};
665 string16 FormatBytesInternal(int64 bytes,
666 DataUnits units,
667 bool show_units,
668 const char* const* suffix) {
669 if (bytes < 0) {
670 NOTREACHED() << "Negative bytes value";
671 return string16();
674 DCHECK(units >= DATA_UNITS_BYTE && units <= DATA_UNITS_GIBIBYTE);
676 // Put the quantity in the right units.
677 double unit_amount = static_cast<double>(bytes);
678 for (int i = 0; i < units; ++i)
679 unit_amount /= 1024.0;
681 char buf[64];
682 if (bytes != 0 && units != DATA_UNITS_BYTE && unit_amount < 100)
683 base::snprintf(buf, arraysize(buf), "%.1lf", unit_amount);
684 else
685 base::snprintf(buf, arraysize(buf), "%.0lf", unit_amount);
687 std::string ret(buf);
688 if (show_units) {
689 ret += " ";
690 ret += suffix[units];
693 return ASCIIToUTF16(ret);
696 string16 FormatBytes(int64 bytes, DataUnits units, bool show_units) {
697 return FormatBytesInternal(bytes, units, show_units, kByteStrings);
700 string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units) {
701 return FormatBytesInternal(bytes, units, show_units, kSpeedStrings);
// Replaces occurrences of |find_this| in |*str| with |replace_with|, starting
// the search at |start_offset|. Only the first occurrence is replaced unless
// |replace_all| is true. Does nothing when |start_offset| is npos or not
// inside the string, or when |find_this| is empty.
template<class StringType>
void DoReplaceSubstringsAfterOffset(StringType* str,
                                    typename StringType::size_type start_offset,
                                    const StringType& find_this,
                                    const StringType& replace_with,
                                    bool replace_all) {
  if ((start_offset == StringType::npos) || (start_offset >= str->length()))
    return;

  DCHECK(!find_this.empty());
  // An empty needle matches at every position, so with the DCHECK compiled
  // out the loop below would never terminate.
  if (find_this.empty())
    return;

  for (typename StringType::size_type offs(str->find(find_this, start_offset));
      offs != StringType::npos; offs = str->find(find_this, offs)) {
    str->replace(offs, find_this.length(), replace_with);
    // Skip over the inserted text so it is never rescanned.
    offs += replace_with.length();

    if (!replace_all)
      break;
  }
}
724 void ReplaceFirstSubstringAfterOffset(string16* str,
725 string16::size_type start_offset,
726 const string16& find_this,
727 const string16& replace_with) {
728 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
729 false); // replace first instance
732 void ReplaceFirstSubstringAfterOffset(std::string* str,
733 std::string::size_type start_offset,
734 const std::string& find_this,
735 const std::string& replace_with) {
736 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
737 false); // replace first instance
740 void ReplaceSubstringsAfterOffset(string16* str,
741 string16::size_type start_offset,
742 const string16& find_this,
743 const string16& replace_with) {
744 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
745 true); // replace all instances
748 void ReplaceSubstringsAfterOffset(std::string* str,
749 std::string::size_type start_offset,
750 const std::string& find_this,
751 const std::string& replace_with) {
752 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
753 true); // replace all instances
// Splits |str| at any character contained in |delimiters| and stores the
// non-empty pieces in |tokens|, which is cleared first. Consecutive
// delimiters produce no empty tokens. Returns the number of tokens.
template<typename STR>
static size_t TokenizeT(const STR& str,
                        const STR& delimiters,
                        std::vector<STR>* tokens) {
  tokens->clear();

  typename STR::size_type start = str.find_first_not_of(delimiters);
  while (start != STR::npos) {
    typename STR::size_type end = str.find_first_of(delimiters, start + 1);
    if (end == STR::npos) {
      // Last token: runs to the end of the string.
      tokens->push_back(str.substr(start));
      break;
    } else {
      tokens->push_back(str.substr(start, end - start));
      start = str.find_first_not_of(delimiters, end + 1);
    }
  }

  return tokens->size();
}
778 size_t Tokenize(const std::wstring& str,
779 const std::wstring& delimiters,
780 std::vector<std::wstring>* tokens) {
781 return TokenizeT(str, delimiters, tokens);
784 #if !defined(WCHAR_T_IS_UTF16)
785 size_t Tokenize(const string16& str,
786 const string16& delimiters,
787 std::vector<string16>* tokens) {
788 return TokenizeT(str, delimiters, tokens);
790 #endif
792 size_t Tokenize(const std::string& str,
793 const std::string& delimiters,
794 std::vector<std::string>* tokens) {
795 return TokenizeT(str, delimiters, tokens);
798 size_t Tokenize(const base::StringPiece& str,
799 const base::StringPiece& delimiters,
800 std::vector<base::StringPiece>* tokens) {
801 return TokenizeT(str, delimiters, tokens);
// Concatenates |parts|, inserting |sep| between consecutive elements. Returns
// an empty string when |parts| is empty.
template<typename STR>
static STR JoinStringT(const std::vector<STR>& parts,
                       typename STR::value_type sep) {
  if (parts.empty())
    return STR();

  // Seed with the first element so that no leading separator is emitted.
  STR result(parts[0]);
  typename std::vector<STR>::const_iterator iter = parts.begin();
  ++iter;

  for (; iter != parts.end(); ++iter) {
    result += sep;
    result += *iter;
  }

  return result;
}
822 std::string JoinString(const std::vector<std::string>& parts, char sep) {
823 return JoinStringT(parts, sep);
826 string16 JoinString(const std::vector<string16>& parts, char16 sep) {
827 return JoinStringT(parts, sep);
830 template<class FormatStringType, class OutStringType>
831 OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string,
832 const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) {
833 size_t substitutions = subst.size();
834 DCHECK(substitutions < 10);
836 size_t sub_length = 0;
837 for (typename std::vector<OutStringType>::const_iterator iter = subst.begin();
838 iter != subst.end(); ++iter) {
839 sub_length += iter->length();
842 OutStringType formatted;
843 formatted.reserve(format_string.length() + sub_length);
845 std::vector<ReplacementOffset> r_offsets;
846 for (typename FormatStringType::const_iterator i = format_string.begin();
847 i != format_string.end(); ++i) {
848 if ('$' == *i) {
849 if (i + 1 != format_string.end()) {
850 ++i;
851 DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i;
852 if ('$' == *i) {
853 while (i != format_string.end() && '$' == *i) {
854 formatted.push_back('$');
855 ++i;
857 --i;
858 } else {
859 uintptr_t index = *i - '1';
860 if (offsets) {
861 ReplacementOffset r_offset(index,
862 static_cast<int>(formatted.size()));
863 r_offsets.insert(std::lower_bound(r_offsets.begin(),
864 r_offsets.end(),
865 r_offset,
866 &CompareParameter),
867 r_offset);
869 if (index < substitutions)
870 formatted.append(subst.at(index));
873 } else {
874 formatted.push_back(*i);
877 if (offsets) {
878 for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin();
879 i != r_offsets.end(); ++i) {
880 offsets->push_back(i->offset);
883 return formatted;
886 string16 ReplaceStringPlaceholders(const string16& format_string,
887 const std::vector<string16>& subst,
888 std::vector<size_t>* offsets) {
889 return DoReplaceStringPlaceholders(format_string, subst, offsets);
892 std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
893 const std::vector<std::string>& subst,
894 std::vector<size_t>* offsets) {
895 return DoReplaceStringPlaceholders(format_string, subst, offsets);
898 string16 ReplaceStringPlaceholders(const string16& format_string,
899 const string16& a,
900 size_t* offset) {
901 std::vector<size_t> offsets;
902 std::vector<string16> subst;
903 subst.push_back(a);
904 string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets);
906 DCHECK(offsets.size() == 1);
907 if (offset) {
908 *offset = offsets[0];
910 return result;
913 static bool IsWildcard(base_icu::UChar32 character) {
914 return character == '*' || character == '?';
917 // Move the strings pointers to the point where they start to differ.
918 template <typename CHAR, typename NEXT>
919 static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
920 const CHAR** string, const CHAR* string_end,
921 NEXT next) {
922 const CHAR* escape = NULL;
923 while (*pattern != pattern_end && *string != string_end) {
924 if (!escape && IsWildcard(**pattern)) {
925 // We don't want to match wildcard here, except if it's escaped.
926 return;
929 // Check if the escapement char is found. If so, skip it and move to the
930 // next character.
931 if (!escape && **pattern == '\\') {
932 escape = *pattern;
933 next(pattern, pattern_end);
934 continue;
937 // Check if the chars match, if so, increment the ptrs.
938 const CHAR* pattern_next = *pattern;
939 const CHAR* string_next = *string;
940 base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
941 if (pattern_char == next(&string_next, string_end) &&
942 pattern_char != (base_icu::UChar32) CBU_SENTINEL) {
943 *pattern = pattern_next;
944 *string = string_next;
945 } else {
946 // Uh ho, it did not match, we are done. If the last char was an
947 // escapement, that means that it was an error to advance the ptr here,
948 // let's put it back where it was. This also mean that the MatchPattern
949 // function will return false because if we can't match an escape char
950 // here, then no one will.
951 if (escape) {
952 *pattern = escape;
954 return;
957 escape = NULL;
// Advances |*pattern| past any run of wildcard characters, stopping at the
// first non-wildcard or at |end|.
template <typename CHAR, typename NEXT>
static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
  while (*pattern != end) {
    if (!IsWildcard(**pattern))
      return;
    next(pattern, end);
  }
}
// Recursive matcher behind MatchPattern. Returns true if [eval, eval_end)
// matches [pattern, pattern_end), where '*' matches any run of characters and
// '?' matches zero or one character. |depth| counts the recursion level;
// beyond kMaxDepth the function gives up and reports no match. |next| decodes
// one character and advances the pointer it is given.
template <typename CHAR, typename NEXT>
static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
                          const CHAR* pattern, const CHAR* pattern_end,
                          int depth,
                          NEXT next) {
  const int kMaxDepth = 16;
  if (depth > kMaxDepth)
    return false;

  // Eat all the matching chars.
  EatSameChars(&pattern, pattern_end, &eval, eval_end, next);

  // If the string is empty, then the pattern must be empty too, or contains
  // only wildcards.
  if (eval == eval_end) {
    EatWildcard(&pattern, pattern_end, next);
    return pattern == pattern_end;
  }

  // Pattern is empty but not string, this is not a match.
  if (pattern == pattern_end)
    return false;

  // If this is a question mark, then we need to compare the rest with
  // the current string or the string with one character eaten.
  const CHAR* next_pattern = pattern;
  next(&next_pattern, pattern_end);
  if (pattern[0] == '?') {
    if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
                      depth + 1, next))
      return true;
    const CHAR* next_eval = eval;
    next(&next_eval, eval_end);
    if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
                      depth + 1, next))
      return true;
  }

  // This is a *, try to match all the possible substrings with the remainder
  // of the pattern.
  if (pattern[0] == '*') {
    // Collapse duplicate wild cards (********** into *) so that the
    // method does not recurse unnecessarily. http://crbug.com/52839
    EatWildcard(&next_pattern, pattern_end, next);

    // Note: |eval| advances one code unit at a time here, not one decoded
    // character.
    while (eval != eval_end) {
      if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
                        depth + 1, next))
        return true;
      eval++;
    }

    // We reached the end of the string, let see if the pattern contains only
    // wildcards.
    if (eval == eval_end) {
      EatWildcard(&pattern, pattern_end, next);
      if (pattern != pattern_end)
        return false;
      return true;
    }
  }

  return false;
}
1035 struct NextCharUTF8 {
1036 base_icu::UChar32 operator()(const char** p, const char* end) {
1037 base_icu::UChar32 c;
1038 int offset = 0;
1039 CBU8_NEXT(*p, offset, end - *p, c);
1040 *p += offset;
1041 return c;
1045 struct NextCharUTF16 {
1046 base_icu::UChar32 operator()(const char16** p, const char16* end) {
1047 base_icu::UChar32 c;
1048 int offset = 0;
1049 CBU16_NEXT(*p, offset, end - *p, c);
1050 *p += offset;
1051 return c;
1055 bool MatchPattern(const base::StringPiece& eval,
1056 const base::StringPiece& pattern) {
1057 return MatchPatternT(eval.data(), eval.data() + eval.size(),
1058 pattern.data(), pattern.data() + pattern.size(),
1059 0, NextCharUTF8());
1062 bool MatchPattern(const string16& eval, const string16& pattern) {
1063 return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
1064 pattern.c_str(), pattern.c_str() + pattern.size(),
1065 0, NextCharUTF16());
1068 // The following code is compatible with the OpenBSD lcpy interface. See:
1069 // http://www.gratisoft.us/todd/papers/strlcpy.html
1070 // ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1072 namespace {
// Copies the NUL-terminated string |src| into |dst|, writing at most
// |dst_size| elements including the terminator. The result is always
// NUL-terminated unless |dst_size| is 0. Returns the length of |src|, so a
// return value >= |dst_size| means the copy was truncated.
template <typename CHAR>
size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
  for (size_t i = 0; i < dst_size; ++i) {
    if ((dst[i] = src[i]) == 0)  // We hit and copied the terminating NULL.
      return i;
  }

  // We were left off at dst_size.  We over copied 1 byte.  Null terminate.
  if (dst_size != 0)
    dst[dst_size - 1] = 0;

  // Count the rest of the |src|, and return its length in characters.
  while (src[dst_size]) ++dst_size;
  return dst_size;
}
1090 } // namespace
1092 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {
1093 return lcpyT<char>(dst, src, dst_size);
1095 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
1096 return lcpyT<wchar_t>(dst, src, dst_size);