1 // -*- c-basic-offset: 2 -*-
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
5 * Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.
6 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
31 #include "wtf/DisallowCType.h"
32 #include "wtf/ASCIICType.h"
41 #include "operations.h"
43 #include "identifier.h"
46 #include "collector.h"
48 #include <wtf/Vector.h>
52 // GCC cstring uses these automatically, but not all implementations do.
// Declarations only (extern): the values themselves are defined in another
// translation unit. In this file they are referenced by UString::toDouble().
63 extern const double NaN
;
64 extern const double Inf
;
// Sentinel used by the size computations in this file to report that a
// requested size cannot be represented: the largest value a size_t can hold.
static inline size_t overflowIndicator()
{
    const size_t sentinel = std::numeric_limits<size_t>::max();
    return sentinel;
}
67 static inline size_t maxUChars() { return std::numeric_limits
<size_t>::max() / sizeof(UChar
); }
// Allocates storage for `length` UChars through fastMalloc().
// A request larger than maxUChars() is diverted by the check below before the
// byte count sizeof(UChar) * length is computed.
// NOTE(review): the statement executed for oversize requests is not visible in
// this excerpt - presumably it returns a null pointer; confirm.
69 static inline UChar
* allocChars(size_t length
)
72 if (length
> maxUChars())
74 return static_cast<UChar
*>(fastMalloc(sizeof(UChar
) * length
));
// Resizes `buffer` to hold `length` UChars through fastRealloc().
// A request larger than maxUChars() is diverted by the check below before the
// byte count sizeof(UChar) * length is computed.
// NOTE(review): the statement executed for oversize requests is not visible in
// this excerpt - presumably it returns a null pointer; confirm.
77 static inline UChar
* reallocChars(UChar
* buffer
, size_t length
)
80 if (length
> maxUChars())
82 return static_cast<UChar
*>(fastRealloc(buffer
, sizeof(UChar
) * length
));
// Constructs a CString from `c`.
// The buffer is allocated with new[] at length + 1 bytes and length + 1 bytes
// are copied from `c`, i.e. the payload plus one trailing byte.
// NOTE(review): the line that sets `length` is not visible here - presumably
// strlen(c), which would make the extra byte the NUL terminator; confirm.
85 CString::CString(const char *c
)
88 data
= new char[length
+1];
89 memcpy(data
, c
, length
+ 1);
// Constructs a CString from `c` with an explicit length `len`.
// The buffer is allocated with new[] at len + 1 bytes.
// NOTE(review): the statements that copy the bytes and fill the extra byte are
// not visible in this excerpt.
92 CString::CString(const char *c
, size_t len
)
95 data
= new char[len
+1];
// Copy constructor.
// When the length is positive and `b` has a buffer, a new buffer of
// length + 1 bytes is allocated and length + 1 bytes are copied from b.data.
// NOTE(review): the assignment of `length` and the handling of the other case
// are not visible in this excerpt.
100 CString::CString(const CString
&b
)
103 if (length
> 0 && b
.data
) {
104 data
= new char[length
+1];
105 memcpy(data
, b
.data
, length
+ 1);
// Assigns from `c`.
// A fresh buffer of length + 1 bytes is allocated and length + 1 bytes are
// copied from `c`.
// NOTE(review): release of the previous buffer, the computation of `length`
// and the return statement are on lines not visible in this excerpt.
116 CString
&CString::operator=(const char *c
)
121 data
= new char[length
+1];
122 memcpy(data
, c
, length
+ 1);
// Copy assignment.
// A fresh buffer of length + 1 bytes is allocated and length + 1 bytes are
// copied from str.data.
// NOTE(review): any self-assignment check, release of the previous buffer and
// the return statement are on lines not visible in this excerpt.
127 CString
&CString::operator=(const CString
&str
)
136 data
= new char[length
+ 1];
137 memcpy(data
, str
.data
, length
+ 1);
145 bool operator==(const CString
& c1
, const CString
& c2
)
147 size_t len
= c1
.size();
148 return len
== c2
.size() && (len
== 0 || memcmp(c1
.c_str(), c2
.c_str(), len
) == 0);
151 // Hack here to avoid a global with a constructor; point to an unsigned short instead of a UChar.
152 static unsigned short almostUChar
;
// The two shared Reps are aggregate-initialized. In both, one member is
// initialized with the address of the Rep itself (presumably baseString -
// confirm against the struct declaration).
153 UString::Rep
UString::Rep::null
= { 0, 0, 1, 0, 0, &UString::Rep::null
, 0, 0, 0, 0, 0, 0 };
// Unlike null, empty has one member pointing at almostUChar (presumably the
// character buffer - confirm), so it has a non-null pointer.
154 UString::Rep
UString::Rep::empty
= { 0, 0, 1, 0, 0, &UString::Rep::empty
, 0, reinterpret_cast<UChar
*>(&almostUChar
), 0, 0, 0, 0 };
// Minimum size of the shared conversion buffer used by UString::ascii().
155 const int normalStatBufferSize
= 4096;
// Shared buffer that UString::ascii() writes into, and its current size.
156 static char *statBuffer
= 0; // FIXME: This buffer is never deallocated.
157 static int statBufferSize
= 0;
// Creates a Rep from a copy of the `length` UChars at `d`: a buffer is
// obtained from allocChars(), filled with memcpy(), and passed on to
// create(UChar*, int). `d` itself is only read.
// NOTE(review): no check of the allocChars() result is visible between the
// allocation and the memcpy in this excerpt; confirm.
159 PassRefPtr
<UString::Rep
> UString::Rep::createCopying (const UChar
* d
, int length
)
161 UChar
* copyD
= allocChars(length
);
162 memcpy(copyD
, d
, length
* sizeof(UChar
));
164 return create(copyD
, length
);
// Creates a Rep for the buffer `d` with length `l`.
// NOTE(review): most of the body is not visible in this excerpt; the only
// visible field initialization is usedPreCapacity = 0.
167 PassRefPtr
<UString::Rep
> UString::Rep::create(UChar
*d
, int l
)
180 r
->usedPreCapacity
= 0;
183 // steal the single reference this Rep was created with
// Creates a Rep that refers to a range of another Rep's characters.
// `offset` is relative to `base`; it is rebased onto base->baseString by
// adding base->offset, and the new Rep's baseString is set to that base.
187 PassRefPtr
<UString::Rep
> UString::Rep::create(PassRefPtr
<Rep
> base
, int offset
, int length
)
191 int baseOffset
= base
->offset
;
// From here on `base` is the Rep that `base` itself was built on.
193 base
= base
->baseString
;
// The requested range must lie inside the part of the base buffer in use:
// not before the used pre-capacity and not past the used capacity.
195 assert(-(offset
+ baseOffset
) <= base
->usedPreCapacity
);
196 assert(offset
+ baseOffset
+ length
<= base
->usedCapacity
);
199 r
->offset
= baseOffset
+ offset
;
// releaseRef() hands the reference held by `base` over to the raw pointer.
204 r
->baseString
= base
.releaseRef();
209 r
->usedPreCapacity
= 0;
212 // steal the single reference this Rep was created with
// Tears down this Rep.
// Visible steps: Identifier::remove(this) is called, and a Rep whose
// baseString is another Rep is handled by a separate branch.
// NOTE(review): any condition guarding the remove() call and both branch
// bodies are on lines not visible in this excerpt.
216 void UString::Rep::destroy()
219 Identifier::remove(this);
220 if (baseString
!= this) {
228 // Golden ratio - arbitrary start value to avoid mapping all 0's to all 0's
229 // or anything like that.
230 const unsigned PHI
= 0x9e3779b9U
;
232 // Paul Hsieh's SuperFastHash
233 // http://www.azillionmonkeys.com/qed/hash.html
// Hashes `len` UChars starting at `s`.
// NOTE(review): only part of the mixing loop is visible in this excerpt; the
// visible step folds s[1] (shifted left by 11) into the running hash.
234 unsigned UString::Rep::computeHash(const UChar
*s
, int len
)
246 tmp
= (s
[1].uc
<< 11) ^ hash
;
247 hash
= (hash
<< 16) ^ tmp
;
259 // Force "avalanching" of final 127 bits
266 // this avoids ever returning a hash code of 0, since that is used to
267 // signal "hash not computed yet", using a value that is likely to be
268 // effectively the same as 0 when the low bits are masked
275 // Paul Hsieh's SuperFastHash
276 // http://www.azillionmonkeys.com/qed/hash.html
// Hashes `len` bytes starting at `s`. Each byte is read through an
// unsigned char cast, so bytes >= 0x80 contribute values 128..255 rather than
// negative values.
277 unsigned UString::Rep::computeHash(const char* s
, int len
)
279 // This hash is designed to work on 16-bit chunks at a time. But since the normal case
280 // (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
281 // were 16-bit chunks, which should give matching results
292 hash
+= (unsigned char)s
[0];
293 tmp
= ((unsigned char)s
[1] << 11) ^ hash
;
294 hash
= (hash
<< 16) ^ tmp
;
301 hash
+= (unsigned char)s
[0];
306 // Force "avalanching" of final 127 bits
313 // this avoids ever returning a hash code of 0, since that is used to
314 // signal "hash not computed yet", using a value that is likely to be
315 // effectively the same as 0 when the low bits are masked
322 unsigned UString::Rep::computeHash(const char* s
)
324 return computeHash(s
, strlen(s
));
327 // put these early so they can be inlined
328 inline size_t UString::expandedSize(size_t size
, size_t otherSize
) const
330 // Do the size calculation in two parts, returning overflowIndicator if
331 // we overflow the maximum value that we can handle.
333 if (size
> maxUChars())
334 return overflowIndicator();
336 size_t expandedSize
= ((size
+ 10) / 10 * 11) + 1;
337 if (maxUChars() - expandedSize
< otherSize
)
338 return overflowIndicator();
340 return expandedSize
+ otherSize
;
343 inline int UString::usedCapacity() const
345 return m_rep
->baseString
->usedCapacity
;
348 inline int UString::usedPreCapacity() const
350 return m_rep
->baseString
->usedPreCapacity
;
// Makes sure the base Rep's buffer has capacity for `requiredLength` UChars
// and records that much of it as used.
// Operates on m_rep->baseString, i.e. on the Rep that holds the buffer.
353 void UString::expandCapacity(int requiredLength
)
355 Rep
* r
= m_rep
->baseString
;
357 if (requiredLength
> r
->capacity
) {
// The new total size also has to cover the existing pre-capacity region.
358 size_t newCapacity
= expandedSize(requiredLength
, r
->preCapacity
);
// NOTE(review): oldBuf is saved before the realloc; the lines that use it are
// not visible here - presumably failure handling; confirm.
359 UChar
* oldBuf
= r
->buf
;
360 r
->buf
= reallocChars(r
->buf
, newCapacity
);
366 r
->capacity
= newCapacity
- r
->preCapacity
;
// usedCapacity only ever grows here.
368 if (requiredLength
> r
->usedCapacity
) {
369 r
->usedCapacity
= requiredLength
;
// Makes sure the base Rep's buffer has room for `requiredPreCap` UChars in
// front of the string data and records that much pre-capacity as used.
// Operates on m_rep->baseString, i.e. on the Rep that holds the buffer.
373 void UString::expandPreCapacity(int requiredPreCap
)
375 Rep
* r
= m_rep
->baseString
;
377 if (requiredPreCap
> r
->preCapacity
) {
// The new total size also has to cover the existing (post) capacity.
378 size_t newCapacity
= expandedSize(requiredPreCap
, r
->capacity
);
// delta = number of UChars by which the total buffer grows.
379 int delta
= newCapacity
- r
->capacity
- r
->preCapacity
;
381 UChar
* newBuf
= allocChars(newCapacity
);
// The whole old buffer is copied to the tail of the new one, so all of the
// added room ends up at the front.
386 memcpy(newBuf
+ delta
, r
->buf
, (r
->capacity
+ r
->preCapacity
) * sizeof(UChar
));
390 r
->preCapacity
= newCapacity
- r
->capacity
;
// usedPreCapacity only ever grows here.
392 if (requiredPreCap
> r
->usedPreCapacity
) {
393 r
->usedPreCapacity
= requiredPreCap
;
398 UString::UString(Empty
)
// Constructs a one-character string from `c`.
// The Rep is created over a freshly allocated one-UChar buffer; the character
// is stored through an unsigned char cast, so a negative `c` becomes a value
// in 128..255 rather than being sign-extended.
403 UString::UString(char c
)
404 : m_rep(Rep::create(allocChars(1), 1))
406 m_rep
->buf
[0] = static_cast<unsigned char>(c
);
// Constructs a string from the NUL-terminated string `c`.
// The length comes from strlen(); a UChar buffer of that length is allocated,
// filled in a loop over the input bytes, and wrapped in a new Rep.
// NOTE(review): the lines between the signature and the strlen() call, and the
// loop body, are not visible in this excerpt.
409 UString::UString(const char* c
)
421 size_t length
= strlen(c
);
422 UChar
*d
= allocChars(length
);
426 for (size_t i
= 0; i
< length
; i
++)
428 m_rep
= Rep::create(d
, static_cast<int>(length
));
// Constructs a string from the first `length` bytes of `c`.
// A UChar buffer of that length is allocated, filled in a loop over the input
// bytes, and wrapped in a new Rep.
// NOTE(review): the lines between the signature and the allocation, and the
// loop body, are not visible in this excerpt.
432 UString::UString(const char* c
, size_t length
)
444 UChar
* d
= allocChars(length
);
448 for (size_t i
= 0; i
< length
; i
++)
450 m_rep
= Rep::create(d
, static_cast<int>(length
));
// Constructs a string from `length` UChars at `c`, using
// Rep::createCopying() so the input buffer is copied rather than adopted.
// NOTE(review): any special-casing before this call is on lines not visible
// in this excerpt.
454 UString::UString(const UChar
* c
, int length
)
459 m_rep
= Rep::createCopying(c
, length
);
// Constructs a string from `length` UChars at `c`.
// Two paths are visible: Rep::createCopying(), which copies the buffer, and
// Rep::create(), which wraps `c` directly.
// NOTE(review): the conditions selecting between them are not visible here -
// presumably the `copy` flag; confirm.
462 UString::UString(UChar
* c
, int length
, bool copy
)
467 m_rep
= Rep::createCopying(c
, length
);
469 m_rep
= Rep::create(c
, length
);
// Constructs a string from the contents of `buffer`, copying
// buffer.size() UChars starting at buffer.data().
// NOTE(review): any special-casing before this call is on lines not visible
// in this excerpt.
472 UString::UString(const Vector
<UChar
>& buffer
)
477 m_rep
= Rep::createCopying(buffer
.data(), buffer
.size());
// Constructs the concatenation of `a` and `b`.
// Where possible the result shares a buffer with one of the operands instead
// of copying both: either b's characters are written after a's in a's
// buffer (shared append), or a's characters are written before b's in b's
// buffer (shared prepend). Otherwise a new buffer is allocated.
// NOTE(review): the first branch of the if/else chain and the declarations of
// `x` and `y` are on lines not visible in this excerpt.
481 UString::UString(const UString
&a
, const UString
&b
)
483 int aSize
= a
.size();
484 int aOffset
= a
.m_rep
->offset
;
485 int bSize
= b
.size();
486 int bOffset
= b
.m_rep
->offset
;
487 int length
= aSize
+ bSize
;
494 } else if (bSize
== 0) {
497 } else if (aOffset
+ aSize
== a
.usedCapacity() && aSize
>= minShareSize
&& 4 * aSize
>= bSize
&&
498 (-bOffset
!= b
.usedPreCapacity() || aSize
>= bSize
)) {
499 // - a reaches the end of its buffer so it qualifies for shared append
500 // - also, it's at least a quarter the length of b - appending to a much shorter
501 // string does more harm than good
502 // - however, if b qualifies for prepend and is longer than a, we'd rather prepend
504 x
.expandCapacity(aOffset
+ length
);
505 if (a
.data() && x
.data()) {
// const_cast: b's characters are written into a's buffer just past a's end.
506 memcpy(const_cast<UChar
*>(a
.data() + aSize
), b
.data(), bSize
* sizeof(UChar
));
507 m_rep
= Rep::create(a
.m_rep
, 0, length
);
510 } else if (-bOffset
== b
.usedPreCapacity() && bSize
>= minShareSize
&& 4 * bSize
>= aSize
) {
511 // - b reaches the beginning of its buffer so it qualifies for shared prepend
512 // - also, it's at least a quarter the length of a - prepending to a much shorter
513 // string does more harm than good
515 y
.expandPreCapacity(-bOffset
+ aSize
);
516 if (b
.data() && y
.data()) {
// const_cast: a's characters are written into b's buffer just before b's start.
517 memcpy(const_cast<UChar
*>(b
.data() - aSize
), a
.data(), aSize
* sizeof(UChar
));
518 m_rep
= Rep::create(b
.m_rep
, -aSize
, length
);
522 // a does not qualify for append, and b does not qualify for prepend, gotta make a whole new string
523 size_t newCapacity
= expandedSize(length
, 0);
524 UChar
* d
= allocChars(newCapacity
);
528 memcpy(d
, a
.data(), aSize
* sizeof(UChar
));
529 memcpy(d
+ aSize
, b
.data(), bSize
* sizeof(UChar
));
530 m_rep
= Rep::create(d
, length
);
// Record the real allocation size, which is larger than `length`.
531 m_rep
->capacity
= newCapacity
;
// Returns a reference to a shared default-constructed UString.
// The object is heap-allocated when the function-local static is initialized,
// i.e. on the first call.
536 const UString
&UString::null()
538 static UString
* n
= new UString
;
// Converts an int to its decimal string form.
// Digits are produced least-significant first and written backwards from the
// end of a stack buffer sized at 1 + 3 * sizeof(i) UChars; the result is the
// range [p, end).
542 UString
UString::from(int i
)
544 UChar buf
[1 + sizeof(i
) * 3];
545 UChar
*end
= buf
+ sizeof(buf
) / sizeof(UChar
);
// INT_MIN is formatted separately with sprintf.
// NOTE(review): presumably because negating INT_MIN would overflow; confirm.
550 } else if (i
== INT_MIN
) {
551 char minBuf
[1 + sizeof(i
) * 3];
552 sprintf(minBuf
, "%d", INT_MIN
);
553 return UString(minBuf
);
555 bool negative
= false;
561 *--p
= (unsigned short)((i
% 10) + '0');
569 return UString(p
, static_cast<int>(end
- p
));
// Converts an unsigned int to its decimal string form.
// Digits are produced least-significant first and written backwards from the
// end of a stack buffer sized at 3 * sizeof(u) UChars; the result is the
// range [p, end).
572 UString
UString::from(unsigned int u
)
574 UChar buf
[sizeof(u
) * 3];
575 UChar
*end
= buf
+ sizeof(buf
) / sizeof(UChar
);
582 *--p
= (unsigned short)((u
% 10) + '0');
587 return UString(p
, static_cast<int>(end
- p
));
// Converts a long to its decimal string form.
// Digits are produced least-significant first and written backwards from the
// end of a stack buffer sized at 1 + 3 * sizeof(l) UChars; the result is the
// range [p, end).
590 UString
UString::from(long l
)
592 UChar buf
[1 + sizeof(l
) * 3];
593 UChar
*end
= buf
+ sizeof(buf
) / sizeof(UChar
);
// LONG_MIN is formatted separately with sprintf.
// NOTE(review): presumably because negating LONG_MIN would overflow; confirm.
598 } else if (l
== LONG_MIN
) {
599 char minBuf
[1 + sizeof(l
) * 3];
600 sprintf(minBuf
, "%ld", LONG_MIN
);
601 return UString(minBuf
);
603 bool negative
= false;
609 *--p
= (unsigned short)((l
% 10) + '0');
617 return UString(p
, static_cast<int>(end
- p
));
// Converts a double to its string form.
// The digit string and decimal-point position come from kjs_dtoa(); the rest
// of the function lays those digits out in `buf`, choosing the notation from
// decimalPoint:
//   -6 <  decimalPoint <= 0   leading zeros are emitted, then the digits
//    0 <  decimalPoint <= 21  plain decimal, padded with zeros or split at
//                             the decimal point
//   result not starting with a digit: copied as-is
//   otherwise                 exponential form with a signed exponent
// The dtoa result is released with kjs_freedtoa() before returning.
// NOTE(review): the declaration of `buf`, sign handling and several literal
// characters are on lines not visible in this excerpt.
620 UString
UString::from(double d
)
622 // avoid ever printing -NaN, in JS conceptually there is only one NaN value
624 return UString("NaN", 3);
630 char *result
= kjs_dtoa(d
, 0, 0, &decimalPoint
, &sign
, NULL
);
631 int length
= static_cast<int>(strlen(result
));
638 if (decimalPoint
<= 0 && decimalPoint
> -6) {
641 for (int j
= decimalPoint
; j
< 0; j
++) {
644 strcpy(buf
+ i
, result
);
646 } else if (decimalPoint
<= 21 && decimalPoint
> 0) {
// All digits lie before the decimal point: copy them, then pad.
647 if (length
<= decimalPoint
) {
648 strcpy(buf
+ i
, result
);
650 for (int j
= 0; j
< decimalPoint
- length
; j
++) {
// Otherwise the digits are split at the decimal point.
655 strncpy(buf
+ i
, result
, decimalPoint
);
658 strcpy(buf
+ i
, result
+ decimalPoint
);
659 i
+= length
- decimalPoint
;
661 } else if (result
[0] < '0' || result
[0] > '9') {
662 strcpy(buf
+ i
, result
);
665 buf
[i
++] = result
[0];
668 strcpy(buf
+ i
, result
+ 1);
673 buf
[i
++] = (decimalPoint
>= 0) ? '+' : '-';
674 // decimalPoint can't be more than 3 digits decimal given the
675 // nature of float representation
676 int exponential
= decimalPoint
- 1;
677 if (exponential
< 0) {
678 exponential
= exponential
* -1;
// Emit the exponent digits, skipping leading zeros.
680 if (exponential
>= 100) {
681 buf
[i
++] = '0' + exponential
/ 100;
683 if (exponential
>= 10) {
684 buf
[i
++] = '0' + (exponential
% 100) / 10;
686 buf
[i
++] = '0' + exponential
% 10;
690 kjs_freedtoa(result
);
692 return UString(buf
, i
);
// Builds a new string by interleaving substrings of this string with
// separator strings: range 0, separator 0, range 1, separator 1, ... until
// both lists are exhausted.
// The single-range, no-separator case is served by a Rep that refers to this
// string's characters instead of copying them.
// NOTE(review): the return values of the early-out branches, and the
// declarations of totalLength and bufferPos, are on lines not visible here.
695 UString
UString::spliceSubstringsWithSeparators(const Range
*substringRanges
, int rangeCount
, const UString
*separators
, int separatorCount
) const
697 if (rangeCount
== 1 && separatorCount
== 0) {
698 int thisSize
= size();
699 int position
= substringRanges
[0].position
;
700 int length
= substringRanges
[0].length
;
// The range covers the whole string.
701 if (position
<= 0 && length
>= thisSize
)
703 return UString::Rep::create(m_rep
, maxInt(0, position
), minInt(thisSize
, length
));
// First pass: total size of the result.
707 for (int i
= 0; i
< rangeCount
; i
++)
708 totalLength
+= substringRanges
[i
].length
;
709 for (int i
= 0; i
< separatorCount
; i
++)
710 totalLength
+= separators
[i
].size();
712 if (totalLength
== 0)
715 UChar
* buffer
= allocChars(totalLength
);
// Second pass: copy ranges and separators alternately.
719 int maxCount
= max(rangeCount
, separatorCount
);
721 for (int i
= 0; i
< maxCount
; i
++) {
722 if (i
< rangeCount
) {
723 memcpy(buffer
+ bufferPos
, data() + substringRanges
[i
].position
, substringRanges
[i
].length
* sizeof(UChar
));
724 bufferPos
+= substringRanges
[i
].length
;
726 if (i
< separatorCount
) {
727 memcpy(buffer
+ bufferPos
, separators
[i
].data(), separators
[i
].size() * sizeof(UChar
));
728 bufferPos
+= separators
[i
].size();
732 return UString::Rep::create(buffer
, totalLength
);
735 // Append a sub-string of <subStr> to this string.
736 // Equivalent to append(subStr.substr(subPos, subLength))
// subLength is clipped so that the range does not run past the end of subStr;
// the clipped range is then appended as a temporary UString.
// NOTE(review): the handling of the first branch and of subPos >= subSize is
// on lines not visible in this excerpt.
738 UString
& UString::append(const UString
& subStr
, int subPos
, int subLength
)
740 int subSize
= subStr
.size();
744 else if (subPos
>= subSize
)
748 if (subPos
+ subLength
>= subSize
)
749 subLength
= subSize
- subPos
;
751 return append(UString(subStr
.data() + subPos
, subLength
));
// Appends `t` to this string.
// Visible strategies, in order of preference:
//   1. this string's Rep is its own base and has a single reference:
//      grow the buffer and write in place;
//   2. this string ends exactly at the used end of a shared buffer and is at
//      least minShareSize long: grow the buffer, write after the end, and
//      switch to a new Rep covering the longer range;
//   3. otherwise: allocate a new buffer and copy both parts.
// NOTE(review): the first branch of the chain and the checks following each
// expandCapacity() call are on lines not visible in this excerpt.
754 UString
&UString::append(const UString
&t
)
756 int thisSize
= size();
757 int thisOffset
= m_rep
->offset
;
758 int tSize
= t
.size();
759 int length
= thisSize
+ tSize
;
765 } else if (tSize
== 0) {
767 } else if (m_rep
->baseIsSelf() && m_rep
->rc
== 1) {
768 // this is direct and has refcount of 1 (so we can just alter it directly)
769 expandCapacity(thisOffset
+ length
);
771 memcpy(const_cast<UChar
*>(data() + thisSize
), t
.data(), tSize
* sizeof(UChar
));
775 } else if (thisOffset
+ thisSize
== usedCapacity() && thisSize
>= minShareSize
) {
776 // this reaches the end of the buffer - extend it if it's long enough to append to
777 expandCapacity(thisOffset
+ length
);
779 memcpy(const_cast<UChar
*>(data() + thisSize
), t
.data(), tSize
* sizeof(UChar
));
780 m_rep
= Rep::create(m_rep
, 0, length
);
783 // this is shared with someone using more capacity, gotta make a whole new string
784 size_t newCapacity
= expandedSize(length
, 0);
785 UChar
* d
= allocChars(newCapacity
);
789 memcpy(d
, data(), thisSize
* sizeof(UChar
));
790 memcpy(const_cast<UChar
*>(d
+ thisSize
), t
.data(), tSize
* sizeof(UChar
));
791 m_rep
= Rep::create(d
, length
);
// Record the real allocation size, which is larger than `length`.
792 m_rep
->capacity
= newCapacity
;
// Appends the NUL-terminated string `t` to this string.
// Follows the same three strategies as append(const UString&): in-place for
// an unshared self-based Rep, shared extension when this string ends at the
// used end of its buffer, and a fresh buffer otherwise. Bytes of `t` are
// stored one per UChar by assignment.
// NOTE(review): the first branch of the chain and the checks following each
// expandCapacity() call are on lines not visible in this excerpt.
800 UString
&UString::append(const char *t
)
802 int thisSize
= size();
803 int thisOffset
= m_rep
->offset
;
804 int tSize
= static_cast<int>(strlen(t
));
805 int length
= thisSize
+ tSize
;
811 } else if (tSize
== 0) {
812 // t is empty, we'll just return *this below.
813 } else if (m_rep
->baseIsSelf() && m_rep
->rc
== 1) {
814 // this is direct and has refcount of 1 (so we can just alter it directly)
815 expandCapacity(thisOffset
+ length
);
816 UChar
*d
= const_cast<UChar
*>(data());
818 for (int i
= 0; i
< tSize
; ++i
)
819 d
[thisSize
+ i
] = t
[i
];
823 } else if (thisOffset
+ thisSize
== usedCapacity() && thisSize
>= minShareSize
) {
824 // this string reaches the end of the buffer - extend it
825 expandCapacity(thisOffset
+ length
);
826 UChar
*d
= const_cast<UChar
*>(data());
828 for (int i
= 0; i
< tSize
; ++i
)
829 d
[thisSize
+ i
] = t
[i
];
830 m_rep
= Rep::create(m_rep
, 0, length
);
833 // this is shared with someone using more capacity, gotta make a whole new string
834 size_t newCapacity
= expandedSize(length
, 0);
835 UChar
* d
= allocChars(newCapacity
);
839 memcpy(d
, data(), thisSize
* sizeof(UChar
));
840 for (int i
= 0; i
< tSize
; ++i
)
841 d
[thisSize
+ i
] = t
[i
];
842 m_rep
= Rep::create(d
, length
);
// Record the real allocation size, which is larger than `length`.
843 m_rep
->capacity
= newCapacity
;
// Appends the single code unit `c` to this string.
// An empty string gets a new Rep rather than modifying the shared one.
// Otherwise the same three strategies as the other append() overloads
// apply: in-place, shared extension, or a fresh buffer.
// NOTE(review): the declaration of `length`, the stores of `c` into the
// buffer, and the checks following each expandCapacity() call are on lines
// not visible in this excerpt.
850 UString
&UString::append(unsigned short c
)
852 int thisOffset
= m_rep
->offset
;
857 // this is empty - must make a new m_rep because we don't want to pollute the shared empty one
858 size_t newCapacity
= expandedSize(1, 0);
859 UChar
* d
= allocChars(newCapacity
);
864 m_rep
= Rep::create(d
, 1);
865 m_rep
->capacity
= newCapacity
;
867 } else if (m_rep
->baseIsSelf() && m_rep
->rc
== 1) {
868 // this is direct and has refcount of 1 (so we can just alter it directly)
869 expandCapacity(thisOffset
+ length
+ 1);
870 UChar
*d
= const_cast<UChar
*>(data());
873 m_rep
->len
= length
+ 1;
876 } else if (thisOffset
+ length
== usedCapacity() && length
>= minShareSize
) {
877 // this reaches the end of the string - extend it and share
878 expandCapacity(thisOffset
+ length
+ 1);
879 UChar
*d
= const_cast<UChar
*>(data());
882 m_rep
= Rep::create(m_rep
, 0, length
+ 1);
885 // this is shared with someone using more capacity, gotta make a whole new string
886 size_t newCapacity
= expandedSize(length
+ 1, 0);
887 UChar
* d
= allocChars(newCapacity
);
891 memcpy(d
, data(), length
* sizeof(UChar
));
893 m_rep
= Rep::create(d
, length
+ 1);
// Record the real allocation size, which is larger than the string length.
894 m_rep
->capacity
= newCapacity
;
901 CString
UString::cstring() const
// Converts this string to 8-bit characters in the file-level statBuffer and
// (presumably - the return statement is not visible here) returns it.
// Each UChar is narrowed with static_cast<char>, so code units above 0xFF
// lose their high bits. Because the buffer is shared, each call overwrites the
// result of the previous one.
906 char *UString::ascii() const
908 // Never make the buffer smaller than normalStatBufferSize.
909 // Thus we almost never need to reallocate.
911 int neededSize
= length
+ 1;
912 if (neededSize
< normalStatBufferSize
) {
913 neededSize
= normalStatBufferSize
;
// Reallocate whenever the needed size differs from the current size, so an
// oversized buffer is replaced by a smaller one as well.
915 if (neededSize
!= statBufferSize
) {
916 delete [] statBuffer
;
917 statBuffer
= new char [neededSize
];
918 statBufferSize
= neededSize
;
921 const UChar
*p
= data();
922 char *q
= statBuffer
;
923 const UChar
*limit
= p
+ length
;
925 *q
= static_cast<char>(p
->uc
);
934 UString
& UString::operator=(Empty
)
// Assigns from a C string by delegating to set(). A null `c` is passed on
// with length 0; otherwise the length is strlen(c).
941 UString
& UString::operator=(const char* c
)
943 set(c
, c
? strlen(c
) : 0);
// Replaces the contents of this string with the first `l` bytes of `c`.
// The existing buffer is a candidate for reuse only when the Rep is unshared
// (rc == 1), is its own base, starts at offset 0 with no pre-capacity, and
// already has capacity for `l` units; a new Rep is created on the other path.
// Bytes are stored through an unsigned char cast.
// NOTE(review): the early-out lines and the bodies around the visible
// condition are not visible in this excerpt.
948 void UString::set(const char* c
, int l
)
961 if (m_rep
->rc
== 1 && l
<= m_rep
->capacity
&& m_rep
->baseIsSelf() && m_rep
->offset
== 0 && m_rep
->preCapacity
== 0) {
971 m_rep
= Rep::create(d
, l
);
973 for (int i
= 0; i
< l
; i
++)
974 d
[i
].uc
= static_cast<unsigned char>(c
[i
]);
// Scans the code units in [data(), data() + size()).
// NOTE(review): the loop and the test applied to each unit are not visible in
// this excerpt - presumably it reports whether every unit fits in 8 bits;
// confirm.
977 bool UString::is8Bit() const
979 const UChar
*u
= data();
980 const UChar
*limit
= u
+ size();
990 const UChar
UString::operator[](int pos
) const
// Parses this string as a number.
// Works on the 8-bit conversion returned by ascii(). Leading whitespace is
// skipped; a "0x"/"0X" prefix selects digit-by-digit hexadecimal parsing,
// anything else goes through kjs_strtod() with special handling of
// infinities. Trailing whitespace is allowed.
//   tolerateTrailingJunk - when false, any other trailing character is
//                          rejected by the final check
//   tolerateEmptyString  - selects 0.0 instead of NaN in the visible
//                          early return
// NOTE(review): several return statements and loop headers are on lines not
// visible in this excerpt.
997 double UString::toDouble(bool tolerateTrailingJunk
, bool tolerateEmptyString
) const
1001 // FIXME: If tolerateTrailingJunk is true, then we want to tolerate non-8-bit junk
1002 // after the number, so is8Bit is too strict a check.
1006 const char *c
= ascii();
1008 // skip leading white space
1009 while (isASCIISpace(*c
))
1014 return tolerateEmptyString
? 0.0 : NaN
;
1017 if (*c
== '0' && (*(c
+1) == 'x' || *(c
+1) == 'X')) {
1018 const char* firstDigitPosition
= c
+ 2;
1022 if (*c
>= '0' && *c
<= '9')
1023 d
= d
* 16.0 + *c
- '0';
1024 else if ((*c
>= 'A' && *c
<= 'F') || (*c
>= 'a' && *c
<= 'f'))
// & 0xdf clears the ASCII case bit, folding 'a'..'f' onto 'A'..'F'.
1025 d
= d
* 16.0 + (*c
& 0xdf) - 'A' + 10.0;
// Past this bound the accumulated value is recomputed by parseIntOverflow()
// over the digits consumed so far.
1030 if (d
>= mantissaOverflowLowerBound
)
1031 d
= parseIntOverflow(firstDigitPosition
, c
- firstDigitPosition
, 16);
1035 d
= kjs_strtod(c
, &end
);
1036 if ((d
!= 0.0 || end
!= c
) && d
!= Inf
&& d
!= -Inf
) {
1043 else if (*c
== '-') {
1048 // We used strtod() to do the conversion. However, strtod() handles
1049 // infinite values slightly differently than JavaScript in that it
1050 // converts the string "inf" with any capitalization to infinity,
1051 // whereas the ECMA spec requires that it be converted to NaN.
1053 if (strncmp(c
, "Infinity", 8) == 0) {
1056 } else if ((d
== Inf
|| d
== -Inf
) && *c
!= 'I' && *c
!= 'i')
1063 // allow trailing white space
1064 while (isASCIISpace(*c
))
1066 // don't allow anything after - unless tolerant=true
1067 if (!tolerateTrailingJunk
&& *c
!= '\0')
1073 double UString::toDouble(bool tolerateTrailingJunk
) const
1075 return toDouble(tolerateTrailingJunk
, true);
1078 double UString::toDouble() const
1080 return toDouble(false, true);
// Converts this string to a uint32_t by way of toDouble().
// The parsed double is compared with its own uint32_t conversion to detect
// values that do not survive the round trip.
// NOTE(review): how `ok` is set and what is returned for such values are on
// lines not visible in this excerpt.
1083 uint32_t UString::toUInt32(bool *ok
) const
1085 double d
= toDouble();
1088 if (d
!= static_cast<uint32_t>(d
)) {
1096 return static_cast<uint32_t>(d
);
// Converts this string to a uint32_t by way of
// toDouble(false, tolerateEmptyString).
// The parsed double is compared with its own uint32_t conversion to detect
// values that do not survive the round trip.
// NOTE(review): how `ok` is set and what is returned for such values are on
// lines not visible in this excerpt.
1099 uint32_t UString::toUInt32(bool *ok
, bool tolerateEmptyString
) const
1101 double d
= toDouble(false, tolerateEmptyString
);
1104 if (d
!= static_cast<uint32_t>(d
)) {
1112 return static_cast<uint32_t>(d
);
// Strict decimal parse of this string into a uint32_t: the string is read one
// code unit at a time, every unit must be an ASCII digit, and both the
// multiply-by-10 and add-digit steps are checked for 32-bit overflow.
// NOTE(review): the loop structure, the accumulator declaration, the handling
// of `ok` and all return statements are on lines not visible in this excerpt.
1115 uint32_t UString::toStrictUInt32(bool *ok
) const
1120 // Empty string is not OK.
1121 int len
= m_rep
->len
;
1124 const UChar
*p
= m_rep
->data();
1125 unsigned short c
= p
->unicode();
1127 // If the first digit is 0, only 0 itself is OK.
1134 // Convert to UInt32, checking for overflow.
1137 // Process character, turning it into a digit.
1138 if (c
< '0' || c
> '9')
1140 const unsigned d
= c
- '0';
1142 // Multiply by 10, checking for overflow out of 32 bits.
1143 if (i
> 0xFFFFFFFFU
/ 10)
1147 // Add in the digit, checking for overflow out of 32 bits.
1148 const unsigned max
= 0xFFFFFFFFU
- d
;
1153 // Handle end of string.
1160 // Get next character.
1161 c
= (++p
)->unicode();
// Searches forward for the first occurrence of `f` at or after `pos`.
// Candidate positions run from data_ + pos up to and including
// data_ + sz - fsz, the last position where `f` can still fit. At each
// candidate the first code unit is compared directly and the remaining
// fsz - 1 units with memcmp.
// NOTE(review): the memcmp compares c + 1 against `fdata`, not `fdata + 1`;
// for that to line up, `fdata` must have been advanced past the first unit on
// a line not visible in this excerpt - confirm.
// NOTE(review): the size checks, the clamping of `pos` and the return
// statements are also on lines not visible here.
1165 int UString::find(const UString
&f
, int pos
) const
1175 const UChar
* data_
= data();
1176 const UChar
* end
= data_
+ sz
- fsz
;
// Byte count of everything in `f` after its first code unit.
1177 int fsizeminusone
= (fsz
- 1) * sizeof(UChar
);
1178 const UChar
*fdata
= f
.data();
1179 unsigned short fchar
= fdata
->uc
;
1181 for (const UChar
* c
= data_
+ pos
; c
<= end
; c
++)
1182 if (c
->uc
== fchar
&& !memcmp(c
+ 1, fdata
, fsizeminusone
))
// Searches forward for the code unit `ch`, scanning from data_ + pos up to
// (not including) data_ + size().
// NOTE(review): the clamping of `pos`, the comparison in the loop body and the
// return statements are on lines not visible in this excerpt.
1188 int UString::find(UChar ch
, int pos
) const
1192 const UChar
* data_
= data();
1193 const UChar
*end
= data_
+ size();
1194 for (const UChar
*c
= data_
+ pos
; c
< end
; c
++)
// Searches backward for the last occurrence of `f` starting at or before
// `pos`. Candidates run from data_ + pos down to data_; at each one the first
// code unit is compared directly and the remaining fsz - 1 units with memcmp.
// NOTE(review): the size checks, the clamping of `pos` and the return
// statements are on lines not visible in this excerpt.
1201 int UString::rfind(const UString
&f
, int pos
) const
// Byte count of everything in `f` after its first code unit.
1213 int fsizeminusone
= (fsz
- 1) * sizeof(UChar
);
1214 const UChar
*fdata
= f
.data();
1215 const UChar
* data_
= data();
1216 for (const UChar
* c
= data_
+ pos
; c
>= data_
; c
--) {
1217 if (*c
== *fdata
&& !memcmp(c
+ 1, fdata
+ 1, fsizeminusone
))
// Searches backward for the code unit `ch`, scanning from data_ + pos down to
// data_.
// NOTE(review): the statement taken when pos + 1 >= size() (presumably a
// clamp of `pos` to the last index), the comparison in the loop body and the
// return statements are on lines not visible in this excerpt.
1224 int UString::rfind(UChar ch
, int pos
) const
1228 if (pos
+ 1 >= size())
1230 const UChar
* data_
= data();
1231 for (const UChar
* c
= data_
+ pos
; c
>= data_
; c
--) {
// Returns the substring of `len` code units starting at `pos`.
// The result is a Rep created over this string's Rep via
// Rep::create(m_rep, pos, len), not a copy of the characters. A request for
// the whole string (pos == 0, len == s) is handled by a separate branch.
// NOTE(review): the clamping of `pos` and `len` and the whole-string return
// are on lines not visible in this excerpt.
1239 UString
UString::substr(int pos
, int len
) const
1252 if (pos
== 0 && len
== s
)
1255 return UString(Rep::create(m_rep
, pos
, len
));
// Gives this string a private buffer before it is modified.
// If the Rep has more than one reference or is built on another Rep, the
// characters are copied into a new allocation and m_rep is replaced by a Rep
// over that copy. Nothing happens for a zero length.
// NOTE(review): the declaration of `l` is on a line not visible in this
// excerpt - presumably size(); confirm.
1258 void UString::copyForWriting()
1261 if (!l
) return; // Not going to touch anything anyway.
1262 if (m_rep
->rc
> 1 || !m_rep
->baseIsSelf()) {
1263 UChar
* n
= allocChars(l
);
1264 memcpy(n
, data(), l
* sizeof(UChar
));
1265 m_rep
= Rep::create(n
, l
);
// Equality of two UStrings.
// Checks are ordered cheapest first: identical Rep, then differing lengths,
// then a memcmp over len * sizeof(UChar) bytes.
// NOTE(review): the return statements of the first two checks are on lines
// not visible in this excerpt.
1269 bool operator==(const UString
& s1
, const UString
& s2
)
1272 if (s1
.m_rep
== s2
.m_rep
)
1276 if (s1
.m_rep
->len
!= s2
.m_rep
->len
)
1279 return (memcmp(s1
.m_rep
->data(), s2
.m_rep
->data(),
1280 s1
.m_rep
->len
* sizeof(UChar
)) == 0);
// Equality of a UString and a NUL-terminated C string.
// Code units and bytes are walked in step, each byte compared through an
// unsigned char cast; the strings are equal only if both end together.
// NOTE(review): the condition guarding the `s1.isEmpty()` return (presumably
// a null `s2`) and the loop body's mismatch return and increments are on
// lines not visible in this excerpt.
1283 bool operator==(const UString
& s1
, const char *s2
)
1286 return s1
.isEmpty();
1289 const UChar
*u
= s1
.data();
1290 const UChar
*uend
= u
+ s1
.size();
1291 while (u
!= uend
&& *s2
) {
1292 if (u
->uc
!= (unsigned char)*s2
)
1298 return u
== uend
&& *s2
== 0;
// Ordering of two UStrings by code unit value.
// The common prefix (up to the shorter length) is skipped; if a differing
// unit is found, the comparison of those two units decides.
// NOTE(review): the loop body, the declaration of `l` and the result when one
// string is a prefix of the other are on lines not visible in this excerpt.
1301 bool operator<(const UString
& s1
, const UString
& s2
)
1303 const int l1
= s1
.size();
1304 const int l2
= s2
.size();
1305 const int lmin
= l1
< l2
? l1
: l2
;
1306 const UChar
*c1
= s1
.data();
1307 const UChar
*c2
= s2
.data();
1309 while (l
< lmin
&& *c1
== *c2
) {
1315 return (c1
->uc
< c2
->uc
);
// Three-way comparison of two UStrings by code unit value.
// The common prefix (up to the shorter length) is skipped; a differing unit
// yields 1 or -1 from comparing those units, otherwise the lengths decide.
// NOTE(review): the loop body, the declaration of `l` and the result for
// equal strings are on lines not visible in this excerpt.
1320 int compare(const UString
& s1
, const UString
& s2
)
1322 const int l1
= s1
.size();
1323 const int l2
= s2
.size();
1324 const int lmin
= l1
< l2
? l1
: l2
;
1325 const UChar
*c1
= s1
.data();
1326 const UChar
*c2
= s2
.data();
1328 while (l
< lmin
&& *c1
== *c2
) {
1335 return (c1
->uc
> c2
->uc
) ? 1 : -1;
1340 return (l1
> l2
) ? 1 : -1;
// Classifies a non-ASCII lead byte by its high bits:
//   top two bits not both set   - first check
//   110xxxxx                    - second check
//   1110xxxx                    - third check
//   11110xxx                    - fourth check
// NOTE(review): the value returned by each check is on a line not visible in
// this excerpt - presumably 0, 2, 3 and 4 respectively, with 0 for anything
// else; confirm.
1343 inline int inlineUTF8SequenceLengthNonASCII(char b0
)
1345 if ((b0
& 0xC0) != 0xC0)
1347 if ((b0
& 0xE0) == 0xC0)
1349 if ((b0
& 0xF0) == 0xE0)
1351 if ((b0
& 0xF8) == 0xF0)
1356 int UTF8SequenceLengthNonASCII(char b0
)
1358 return inlineUTF8SequenceLengthNonASCII(b0
);
1361 inline int inlineUTF8SequenceLength(char b0
)
1363 return (b0
& 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0
);
1366 // Given a first byte, gives the length of the UTF-8 sequence it begins.
1367 // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
1368 // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
1369 int UTF8SequenceLength(char b0
)
1371 return (b0
& 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0
);
1374 // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
1375 // Only allows Unicode characters (U-00000000 to U-0010FFFF).
1376 // Returns -1 if the sequence is not valid (including presence of extra bytes).
// The expected length is taken from the lead byte; each following byte must be
// a continuation byte (10xxxxxx), and the byte after the sequence is read as
// well.
// NOTE(review): the checks on `length`, the tests applied to the byte after
// the sequence and all return statements are on lines not visible in this
// excerpt.
1377 int decodeUTF8Sequence(const char *sequence
)
1379 // Handle 0-byte sequences (never valid).
1380 const unsigned char b0
= sequence
[0];
1381 const int length
= inlineUTF8SequenceLength(b0
);
1385 // Handle 1-byte sequences (plain ASCII).
1386 const unsigned char b1
= sequence
[1];
1393 // Handle 2-byte sequences.
1394 if ((b1
& 0xC0) != 0x80)
1396 const unsigned char b2
= sequence
[2];
// 5 payload bits from the lead byte, 6 from the continuation byte.
1400 const int c
= ((b0
& 0x1F) << 6) | (b1
& 0x3F);
1406 // Handle 3-byte sequences.
1407 if ((b2
& 0xC0) != 0x80)
1409 const unsigned char b3
= sequence
[3];
// 4 payload bits from the lead byte, 6 from each continuation byte.
1413 const int c
= ((b0
& 0xF) << 12) | ((b1
& 0x3F) << 6) | (b2
& 0x3F);
1416 // UTF-16 surrogates should never appear in UTF-8 data.
1417 if (c
>= 0xD800 && c
<= 0xDFFF)
1419 // Backwards BOM and U+FFFF should never appear in UTF-8 data.
1420 if (c
== 0xFFFE || c
== 0xFFFF)
1425 // Handle 4-byte sequences.
1426 if ((b3
& 0xC0) != 0x80)
1428 const unsigned char b4
= sequence
[4];
// 3 payload bits from the lead byte, 6 from each continuation byte.
1432 const int c
= ((b0
& 0x7) << 18) | ((b1
& 0x3F) << 12) | ((b2
& 0x3F) << 6) | (b3
& 0x3F);
// A 4-byte sequence must encode a code point in U+10000..U+10FFFF.
1433 if (c
< 0x10000 || c
> 0x10FFFF)
1441 CString
UString::UTF8String() const
1443 // Allocate a buffer big enough to hold all the characters.
1444 const int length
= size();
1445 Vector
<char, 1024> buffer(length
* 3);
1447 // Convert to runs of 8-bit characters.
1448 char *p
= buffer
.begin();
1449 const unsigned short* d
= &data()->uc
;
1450 for (int i
= 0; i
!= length
; ++i
) {
1451 unsigned int c
= d
[i
], sc
;
1454 } else if (c
< 0x800) {
1455 *p
++ = (char)((c
>> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
1456 *p
++ = (char)((c
| 0x80) & 0xBF); // next 6 bits, with high bit set
1457 } else if (c
>= 0xD800 && c
<= 0xDBFF && (i
+1) < length
&&
1458 (sc
= d
[i
+1]) >= 0xDC00 && sc
<= 0xDFFF) {
1459 sc
= 0x10000 + (((c
& 0x3FF) << 10) | (sc
& 0x3FF));
1460 *p
++ = (char)((sc
>> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
1461 *p
++ = (char)(((sc
>> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
1462 *p
++ = (char)(((sc
>> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1463 *p
++ = (char)((sc
| 0x80) & 0xBF); // next 6 bits, with high bit set
1466 *p
++ = (char)((c
>> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
1467 *p
++ = (char)(((c
>> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1468 *p
++ = (char)((c
| 0x80) & 0xBF); // next 6 bits, with high bit set
1472 // Return the result as a C string.
1473 CString
result(buffer
.data(), p
- buffer
.data());