fix logic
[personal-kdelibs.git] / kjs / ustring.cpp
blob65568cd107ead62e911c5b2fdb550349cd439b2b
1 // -*- c-basic-offset: 2 -*-
2 /*
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
5 * Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.
6 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
25 #include "ustring.h"
26 #include <config.h>
28 #include <assert.h>
29 #include <stdlib.h>
30 #include <stdio.h>
31 #include "wtf/DisallowCType.h"
32 #include "wtf/ASCIICType.h"
33 #if HAVE(STRING_H)
34 #include <string.h>
35 #endif
36 #if HAVE(STRINGS_H)
37 #include <strings.h>
38 #endif
39 #include <limits.h>
41 #include "operations.h"
42 #include "function.h"
43 #include "identifier.h"
44 #include <math.h>
45 #include "dtoa.h"
46 #include "collector.h"
48 #include <wtf/Vector.h>
50 using std::max;
52 // GCC cstring uses these automatically, but not all implementations do.
53 using std::strlen;
54 using std::strcpy;
55 using std::strncpy;
56 using std::memset;
57 using std::memcpy;
59 using namespace WTF;
61 namespace KJS {
63 extern const double NaN;
64 extern const double Inf;
66 static inline size_t overflowIndicator() { return std::numeric_limits<size_t>::max(); }
67 static inline size_t maxUChars() { return std::numeric_limits<size_t>::max() / sizeof(UChar); }
69 static inline UChar* allocChars(size_t length)
71 assert(length);
72 if (length > maxUChars())
73 return 0;
74 return static_cast<UChar*>(fastMalloc(sizeof(UChar) * length));
77 static inline UChar* reallocChars(UChar* buffer, size_t length)
79 ASSERT(length);
80 if (length > maxUChars())
81 return 0;
82 return static_cast<UChar*>(fastRealloc(buffer, sizeof(UChar) * length));
85 CString::CString(const char *c)
87 length = strlen(c);
88 data = new char[length+1];
89 memcpy(data, c, length + 1);
92 CString::CString(const char *c, size_t len)
94 length = len;
95 data = new char[len+1];
96 memcpy(data, c, len);
97 data[len] = 0;
100 CString::CString(const CString &b)
102 length = b.length;
103 if (length > 0 && b.data) {
104 data = new char[length+1];
105 memcpy(data, b.data, length + 1);
107 else
108 data = 0;
111 CString::~CString()
113 delete [] data;
116 CString &CString::operator=(const char *c)
118 if (data)
119 delete [] data;
120 length = strlen(c);
121 data = new char[length+1];
122 memcpy(data, c, length + 1);
124 return *this;
127 CString &CString::operator=(const CString &str)
129 if (this == &str)
130 return *this;
132 if (data)
133 delete [] data;
134 length = str.length;
135 if (str.data) {
136 data = new char[length + 1];
137 memcpy(data, str.data, length + 1);
139 else
140 data = 0;
142 return *this;
145 bool operator==(const CString& c1, const CString& c2)
147 size_t len = c1.size();
148 return len == c2.size() && (len == 0 || memcmp(c1.c_str(), c2.c_str(), len) == 0);
151 // Hack here to avoid a global with a constructor; point to an unsigned short instead of a UChar.
152 static unsigned short almostUChar;
153 UString::Rep UString::Rep::null = { 0, 0, 1, 0, 0, &UString::Rep::null, 0, 0, 0, 0, 0, 0 };
154 UString::Rep UString::Rep::empty = { 0, 0, 1, 0, 0, &UString::Rep::empty, 0, reinterpret_cast<UChar*>(&almostUChar), 0, 0, 0, 0 };
155 const int normalStatBufferSize = 4096;
156 static char *statBuffer = 0; // FIXME: This buffer is never deallocated.
157 static int statBufferSize = 0;
159 PassRefPtr<UString::Rep> UString::Rep::createCopying (const UChar* d, int length)
161 UChar* copyD = allocChars(length);
162 memcpy(copyD, d, length * sizeof(UChar));
164 return create(copyD, length);
167 PassRefPtr<UString::Rep> UString::Rep::create(UChar *d, int l)
169 Rep* r = new Rep;
170 r->offset = 0;
171 r->len = l;
172 r->rc = 1;
173 r->_hash = 0;
174 r->isIdentifier = 0;
175 r->baseString = r;
176 r->reportedCost = 0;
177 r->buf = d;
178 r->usedCapacity = l;
179 r->capacity = l;
180 r->usedPreCapacity = 0;
181 r->preCapacity = 0;
183 // steal the single reference this Rep was created with
184 return adoptRef(r);
187 PassRefPtr<UString::Rep> UString::Rep::create(PassRefPtr<Rep> base, int offset, int length)
189 assert(base);
191 int baseOffset = base->offset;
193 base = base->baseString;
195 assert(-(offset + baseOffset) <= base->usedPreCapacity);
196 assert(offset + baseOffset + length <= base->usedCapacity);
198 Rep* r = new Rep;
199 r->offset = baseOffset + offset;
200 r->len = length;
201 r->rc = 1;
202 r->_hash = 0;
203 r->isIdentifier = 0;
204 r->baseString = base.releaseRef();
205 r->reportedCost = 0;
206 r->buf = 0;
207 r->usedCapacity = 0;
208 r->capacity = 0;
209 r->usedPreCapacity = 0;
210 r->preCapacity = 0;
212 // steal the single reference this Rep was created with
213 return adoptRef(r);
216 void UString::Rep::destroy()
218 if (isIdentifier)
219 Identifier::remove(this);
220 if (baseString != this) {
221 baseString->deref();
222 } else {
223 fastFree(buf);
225 delete this;
228 // Golden ratio - arbitrary start value to avoid mapping all 0's to all 0's
229 // or anything like that.
230 const unsigned PHI = 0x9e3779b9U;
232 // Paul Hsieh's SuperFastHash
233 // http://www.azillionmonkeys.com/qed/hash.html
234 unsigned UString::Rep::computeHash(const UChar *s, int len)
236 unsigned l = len;
237 uint32_t hash = PHI;
238 uint32_t tmp;
240 int rem = l & 1;
241 l >>= 1;
243 // Main loop
244 for (; l > 0; l--) {
245 hash += s[0].uc;
246 tmp = (s[1].uc << 11) ^ hash;
247 hash = (hash << 16) ^ tmp;
248 s += 2;
249 hash += hash >> 11;
252 // Handle end case
253 if (rem) {
254 hash += s[0].uc;
255 hash ^= hash << 11;
256 hash += hash >> 17;
259 // Force "avalanching" of final 127 bits
260 hash ^= hash << 3;
261 hash += hash >> 5;
262 hash ^= hash << 2;
263 hash += hash >> 15;
264 hash ^= hash << 10;
266 // this avoids ever returning a hash code of 0, since that is used to
267 // signal "hash not computed yet", using a value that is likely to be
268 // effectively the same as 0 when the low bits are masked
269 if (hash == 0)
270 hash = 0x80000000;
272 return hash;
275 // Paul Hsieh's SuperFastHash
276 // http://www.azillionmonkeys.com/qed/hash.html
277 unsigned UString::Rep::computeHash(const char* s, int len)
279 // This hash is designed to work on 16-bit chunks at a time. But since the normal case
280 // (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
281 // were 16-bit chunks, which should give matching results
283 uint32_t hash = PHI;
284 uint32_t tmp;
285 unsigned l = len;
287 int rem = l & 1;
288 l >>= 1;
290 // Main loop
291 for (; l > 0; l--) {
292 hash += (unsigned char)s[0];
293 tmp = ((unsigned char)s[1] << 11) ^ hash;
294 hash = (hash << 16) ^ tmp;
295 s += 2;
296 hash += hash >> 11;
299 // Handle end case
300 if (rem) {
301 hash += (unsigned char)s[0];
302 hash ^= hash << 11;
303 hash += hash >> 17;
306 // Force "avalanching" of final 127 bits
307 hash ^= hash << 3;
308 hash += hash >> 5;
309 hash ^= hash << 2;
310 hash += hash >> 15;
311 hash ^= hash << 10;
313 // this avoids ever returning a hash code of 0, since that is used to
314 // signal "hash not computed yet", using a value that is likely to be
315 // effectively the same as 0 when the low bits are masked
316 if (hash == 0)
317 hash = 0x80000000;
319 return hash;
322 unsigned UString::Rep::computeHash(const char* s)
324 return computeHash(s, strlen(s));
327 // put these early so they can be inlined
328 inline size_t UString::expandedSize(size_t size, size_t otherSize) const
330 // Do the size calculation in two parts, returning overflowIndicator if
331 // we overflow the maximum value that we can handle.
333 if (size > maxUChars())
334 return overflowIndicator();
336 size_t expandedSize = ((size + 10) / 10 * 11) + 1;
337 if (maxUChars() - expandedSize < otherSize)
338 return overflowIndicator();
340 return expandedSize + otherSize;
343 inline int UString::usedCapacity() const
345 return m_rep->baseString->usedCapacity;
348 inline int UString::usedPreCapacity() const
350 return m_rep->baseString->usedPreCapacity;
353 void UString::expandCapacity(int requiredLength)
355 Rep* r = m_rep->baseString;
357 if (requiredLength > r->capacity) {
358 size_t newCapacity = expandedSize(requiredLength, r->preCapacity);
359 UChar* oldBuf = r->buf;
360 r->buf = reallocChars(r->buf, newCapacity);
361 if (!r->buf) {
362 r->buf = oldBuf;
363 m_rep = &Rep::null;
364 return;
366 r->capacity = newCapacity - r->preCapacity;
368 if (requiredLength > r->usedCapacity) {
369 r->usedCapacity = requiredLength;
373 void UString::expandPreCapacity(int requiredPreCap)
375 Rep* r = m_rep->baseString;
377 if (requiredPreCap > r->preCapacity) {
378 size_t newCapacity = expandedSize(requiredPreCap, r->capacity);
379 int delta = newCapacity - r->capacity - r->preCapacity;
381 UChar* newBuf = allocChars(newCapacity);
382 if (!newBuf) {
383 m_rep = &Rep::null;
384 return;
386 memcpy(newBuf + delta, r->buf, (r->capacity + r->preCapacity) * sizeof(UChar));
387 fastFree(r->buf);
388 r->buf = newBuf;
390 r->preCapacity = newCapacity - r->capacity;
392 if (requiredPreCap > r->usedPreCapacity) {
393 r->usedPreCapacity = requiredPreCap;
398 UString::UString(Empty)
399 : m_rep(&Rep::empty)
403 UString::UString(char c)
404 : m_rep(Rep::create(allocChars(1), 1))
406 m_rep->buf[0] = static_cast<unsigned char>(c);
409 UString::UString(const char* c)
411 if (!c) {
412 m_rep = &Rep::null;
413 return;
416 if (!c[0]) {
417 m_rep = &Rep::empty;
418 return;
421 size_t length = strlen(c);
422 UChar *d = allocChars(length);
423 if (!d)
424 m_rep = &Rep::null;
425 else {
426 for (size_t i = 0; i < length; i++)
427 d[i].uc = c[i];
428 m_rep = Rep::create(d, static_cast<int>(length));
432 UString::UString(const char* c, size_t length)
434 if (!c) {
435 m_rep = &Rep::null;
436 return;
439 if (length == 0) {
440 m_rep = &Rep::empty;
441 return;
444 UChar* d = allocChars(length);
445 if (!d)
446 m_rep = &Rep::null;
447 else {
448 for (size_t i = 0; i < length; i++)
449 d[i].uc = c[i];
450 m_rep = Rep::create(d, static_cast<int>(length));
454 UString::UString(const UChar* c, int length)
456 if (length == 0)
457 m_rep = &Rep::empty;
458 else
459 m_rep = Rep::createCopying(c, length);
462 UString::UString(UChar* c, int length, bool copy)
464 if (length == 0)
465 m_rep = &Rep::empty;
466 else if (copy)
467 m_rep = Rep::createCopying(c, length);
468 else
469 m_rep = Rep::create(c, length);
472 UString::UString(const Vector<UChar>& buffer)
474 if (!buffer.size())
475 m_rep = &Rep::empty;
476 else
477 m_rep = Rep::createCopying(buffer.data(), buffer.size());
481 UString::UString(const UString &a, const UString &b)
483 int aSize = a.size();
484 int aOffset = a.m_rep->offset;
485 int bSize = b.size();
486 int bOffset = b.m_rep->offset;
487 int length = aSize + bSize;
489 // possible cases:
491 if (aSize == 0) {
492 // a is empty
493 m_rep = b.m_rep;
494 } else if (bSize == 0) {
495 // b is empty
496 m_rep = a.m_rep;
497 } else if (aOffset + aSize == a.usedCapacity() && aSize >= minShareSize && 4 * aSize >= bSize &&
498 (-bOffset != b.usedPreCapacity() || aSize >= bSize)) {
499 // - a reaches the end of its buffer so it qualifies for shared append
500 // - also, it's at least a quarter the length of b - appending to a much shorter
501 // string does more harm than good
502 // - however, if b qualifies for prepend and is longer than a, we'd rather prepend
503 UString x(a);
504 x.expandCapacity(aOffset + length);
505 if (a.data() && x.data()) {
506 memcpy(const_cast<UChar *>(a.data() + aSize), b.data(), bSize * sizeof(UChar));
507 m_rep = Rep::create(a.m_rep, 0, length);
508 } else
509 m_rep = &Rep::null;
510 } else if (-bOffset == b.usedPreCapacity() && bSize >= minShareSize && 4 * bSize >= aSize) {
511 // - b reaches the beginning of its buffer so it qualifies for shared prepend
512 // - also, it's at least a quarter the length of a - prepending to a much shorter
513 // string does more harm than good
514 UString y(b);
515 y.expandPreCapacity(-bOffset + aSize);
516 if (b.data() && y.data()) {
517 memcpy(const_cast<UChar *>(b.data() - aSize), a.data(), aSize * sizeof(UChar));
518 m_rep = Rep::create(b.m_rep, -aSize, length);
519 } else
520 m_rep = &Rep::null;
521 } else {
522 // a does not qualify for append, and b does not qualify for prepend, gotta make a whole new string
523 size_t newCapacity = expandedSize(length, 0);
524 UChar* d = allocChars(newCapacity);
525 if (!d)
526 m_rep = &Rep::null;
527 else {
528 memcpy(d, a.data(), aSize * sizeof(UChar));
529 memcpy(d + aSize, b.data(), bSize * sizeof(UChar));
530 m_rep = Rep::create(d, length);
531 m_rep->capacity = newCapacity;
536 const UString &UString::null()
538 static UString* n = new UString;
539 return *n;
542 UString UString::from(int i)
544 UChar buf[1 + sizeof(i) * 3];
545 UChar *end = buf + sizeof(buf) / sizeof(UChar);
546 UChar *p = end;
548 if (i == 0) {
549 *--p = '0';
550 } else if (i == INT_MIN) {
551 char minBuf[1 + sizeof(i) * 3];
552 sprintf(minBuf, "%d", INT_MIN);
553 return UString(minBuf);
554 } else {
555 bool negative = false;
556 if (i < 0) {
557 negative = true;
558 i = -i;
560 while (i) {
561 *--p = (unsigned short)((i % 10) + '0');
562 i /= 10;
564 if (negative) {
565 *--p = '-';
569 return UString(p, static_cast<int>(end - p));
572 UString UString::from(unsigned int u)
574 UChar buf[sizeof(u) * 3];
575 UChar *end = buf + sizeof(buf) / sizeof(UChar);
576 UChar *p = end;
578 if (u == 0) {
579 *--p = '0';
580 } else {
581 while (u) {
582 *--p = (unsigned short)((u % 10) + '0');
583 u /= 10;
587 return UString(p, static_cast<int>(end - p));
590 UString UString::from(long l)
592 UChar buf[1 + sizeof(l) * 3];
593 UChar *end = buf + sizeof(buf) / sizeof(UChar);
594 UChar *p = end;
596 if (l == 0) {
597 *--p = '0';
598 } else if (l == LONG_MIN) {
599 char minBuf[1 + sizeof(l) * 3];
600 sprintf(minBuf, "%ld", LONG_MIN);
601 return UString(minBuf);
602 } else {
603 bool negative = false;
604 if (l < 0) {
605 negative = true;
606 l = -l;
608 while (l) {
609 *--p = (unsigned short)((l % 10) + '0');
610 l /= 10;
612 if (negative) {
613 *--p = '-';
617 return UString(p, static_cast<int>(end - p));
620 UString UString::from(double d)
622 // avoid ever printing -NaN, in JS conceptually there is only one NaN value
623 if (isNaN(d))
624 return UString("NaN", 3);
626 char buf[80];
627 int decimalPoint;
628 int sign;
630 char *result = kjs_dtoa(d, 0, 0, &decimalPoint, &sign, NULL);
631 int length = static_cast<int>(strlen(result));
633 int i = 0;
634 if (sign) {
635 buf[i++] = '-';
638 if (decimalPoint <= 0 && decimalPoint > -6) {
639 buf[i++] = '0';
640 buf[i++] = '.';
641 for (int j = decimalPoint; j < 0; j++) {
642 buf[i++] = '0';
644 strcpy(buf + i, result);
645 i += length;
646 } else if (decimalPoint <= 21 && decimalPoint > 0) {
647 if (length <= decimalPoint) {
648 strcpy(buf + i, result);
649 i += length;
650 for (int j = 0; j < decimalPoint - length; j++) {
651 buf[i++] = '0';
653 // buf[i] = '\0';
654 } else {
655 strncpy(buf + i, result, decimalPoint);
656 i += decimalPoint;
657 buf[i++] = '.';
658 strcpy(buf + i, result + decimalPoint);
659 i += length - decimalPoint;
661 } else if (result[0] < '0' || result[0] > '9') {
662 strcpy(buf + i, result);
663 i += length;
664 } else {
665 buf[i++] = result[0];
666 if (length > 1) {
667 buf[i++] = '.';
668 strcpy(buf + i, result + 1);
669 i += length - 1;
672 buf[i++] = 'e';
673 buf[i++] = (decimalPoint >= 0) ? '+' : '-';
674 // decimalPoint can't be more than 3 digits decimal given the
675 // nature of float representation
676 int exponential = decimalPoint - 1;
677 if (exponential < 0) {
678 exponential = exponential * -1;
680 if (exponential >= 100) {
681 buf[i++] = '0' + exponential / 100;
683 if (exponential >= 10) {
684 buf[i++] = '0' + (exponential % 100) / 10;
686 buf[i++] = '0' + exponential % 10;
687 // buf[i++] = '\0';
690 kjs_freedtoa(result);
692 return UString(buf, i);
695 UString UString::spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const
697 if (rangeCount == 1 && separatorCount == 0) {
698 int thisSize = size();
699 int position = substringRanges[0].position;
700 int length = substringRanges[0].length;
701 if (position <= 0 && length >= thisSize)
702 return *this;
703 return UString::Rep::create(m_rep, maxInt(0, position), minInt(thisSize, length));
706 int totalLength = 0;
707 for (int i = 0; i < rangeCount; i++)
708 totalLength += substringRanges[i].length;
709 for (int i = 0; i < separatorCount; i++)
710 totalLength += separators[i].size();
712 if (totalLength == 0)
713 return "";
715 UChar* buffer = allocChars(totalLength);
716 if (!buffer)
717 return null();
719 int maxCount = max(rangeCount, separatorCount);
720 int bufferPos = 0;
721 for (int i = 0; i < maxCount; i++) {
722 if (i < rangeCount) {
723 memcpy(buffer + bufferPos, data() + substringRanges[i].position, substringRanges[i].length * sizeof(UChar));
724 bufferPos += substringRanges[i].length;
726 if (i < separatorCount) {
727 memcpy(buffer + bufferPos, separators[i].data(), separators[i].size() * sizeof(UChar));
728 bufferPos += separators[i].size();
732 return UString::Rep::create(buffer, totalLength);
735 // Append a sub-string of <subStr> to this string.
736 // Equivalent to append(subStr.substr(subPos, subLength))
738 UString& UString::append(const UString& subStr, int subPos, int subLength)
740 int subSize = subStr.size();
742 if (subPos < 0)
743 subPos = 0;
744 else if (subPos >= subSize)
745 subPos = subSize;
746 if (subLength < 0)
747 subLength = subSize;
748 if (subPos + subLength >= subSize)
749 subLength = subSize - subPos;
751 return append(UString(subStr.data() + subPos, subLength));
754 UString &UString::append(const UString &t)
756 int thisSize = size();
757 int thisOffset = m_rep->offset;
758 int tSize = t.size();
759 int length = thisSize + tSize;
761 // possible cases:
762 if (thisSize == 0) {
763 // this is empty
764 *this = t;
765 } else if (tSize == 0) {
766 // t is empty
767 } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
768 // this is direct and has refcount of 1 (so we can just alter it directly)
769 expandCapacity(thisOffset + length);
770 if (data()) {
771 memcpy(const_cast<UChar*>(data() + thisSize), t.data(), tSize * sizeof(UChar));
772 m_rep->len = length;
773 m_rep->_hash = 0;
775 } else if (thisOffset + thisSize == usedCapacity() && thisSize >= minShareSize) {
776 // this reaches the end of the buffer - extend it if it's long enough to append to
777 expandCapacity(thisOffset + length);
778 if (data()) {
779 memcpy(const_cast<UChar*>(data() + thisSize), t.data(), tSize * sizeof(UChar));
780 m_rep = Rep::create(m_rep, 0, length);
782 } else {
783 // this is shared with someone using more capacity, gotta make a whole new string
784 size_t newCapacity = expandedSize(length, 0);
785 UChar* d = allocChars(newCapacity);
786 if (!d)
787 m_rep = &Rep::null;
788 else {
789 memcpy(d, data(), thisSize * sizeof(UChar));
790 memcpy(const_cast<UChar*>(d + thisSize), t.data(), tSize * sizeof(UChar));
791 m_rep = Rep::create(d, length);
792 m_rep->capacity = newCapacity;
796 return *this;
800 UString &UString::append(const char *t)
802 int thisSize = size();
803 int thisOffset = m_rep->offset;
804 int tSize = static_cast<int>(strlen(t));
805 int length = thisSize + tSize;
807 // possible cases:
808 if (thisSize == 0) {
809 // this is empty
810 *this = t;
811 } else if (tSize == 0) {
812 // t is empty, we'll just return *this below.
813 } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
814 // this is direct and has refcount of 1 (so we can just alter it directly)
815 expandCapacity(thisOffset + length);
816 UChar *d = const_cast<UChar *>(data());
817 if (d) {
818 for (int i = 0; i < tSize; ++i)
819 d[thisSize + i] = t[i];
820 m_rep->len = length;
821 m_rep->_hash = 0;
823 } else if (thisOffset + thisSize == usedCapacity() && thisSize >= minShareSize) {
824 // this string reaches the end of the buffer - extend it
825 expandCapacity(thisOffset + length);
826 UChar *d = const_cast<UChar *>(data());
827 if (d) {
828 for (int i = 0; i < tSize; ++i)
829 d[thisSize + i] = t[i];
830 m_rep = Rep::create(m_rep, 0, length);
832 } else {
833 // this is shared with someone using more capacity, gotta make a whole new string
834 size_t newCapacity = expandedSize(length, 0);
835 UChar* d = allocChars(newCapacity);
836 if (!d)
837 m_rep = &Rep::null;
838 else {
839 memcpy(d, data(), thisSize * sizeof(UChar));
840 for (int i = 0; i < tSize; ++i)
841 d[thisSize + i] = t[i];
842 m_rep = Rep::create(d, length);
843 m_rep->capacity = newCapacity;
847 return *this;
850 UString &UString::append(unsigned short c)
852 int thisOffset = m_rep->offset;
853 int length = size();
855 // possible cases:
856 if (length == 0) {
857 // this is empty - must make a new m_rep because we don't want to pollute the shared empty one
858 size_t newCapacity = expandedSize(1, 0);
859 UChar* d = allocChars(newCapacity);
860 if (!d)
861 m_rep = &Rep::null;
862 else {
863 d[0] = c;
864 m_rep = Rep::create(d, 1);
865 m_rep->capacity = newCapacity;
867 } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
868 // this is direct and has refcount of 1 (so we can just alter it directly)
869 expandCapacity(thisOffset + length + 1);
870 UChar *d = const_cast<UChar *>(data());
871 if (d) {
872 d[length] = c;
873 m_rep->len = length + 1;
874 m_rep->_hash = 0;
876 } else if (thisOffset + length == usedCapacity() && length >= minShareSize) {
877 // this reaches the end of the string - extend it and share
878 expandCapacity(thisOffset + length + 1);
879 UChar *d = const_cast<UChar *>(data());
880 if (d) {
881 d[length] = c;
882 m_rep = Rep::create(m_rep, 0, length + 1);
884 } else {
885 // this is shared with someone using more capacity, gotta make a whole new string
886 size_t newCapacity = expandedSize(length + 1, 0);
887 UChar* d = allocChars(newCapacity);
888 if (!d)
889 m_rep = &Rep::null;
890 else {
891 memcpy(d, data(), length * sizeof(UChar));
892 d[length] = c;
893 m_rep = Rep::create(d, length + 1);
894 m_rep->capacity = newCapacity;
898 return *this;
901 CString UString::cstring() const
903 return ascii();
906 char *UString::ascii() const
908 // Never make the buffer smaller than normalStatBufferSize.
909 // Thus we almost never need to reallocate.
910 int length = size();
911 int neededSize = length + 1;
912 if (neededSize < normalStatBufferSize) {
913 neededSize = normalStatBufferSize;
915 if (neededSize != statBufferSize) {
916 delete [] statBuffer;
917 statBuffer = new char [neededSize];
918 statBufferSize = neededSize;
921 const UChar *p = data();
922 char *q = statBuffer;
923 const UChar *limit = p + length;
924 while (p != limit) {
925 *q = static_cast<char>(p->uc);
926 ++p;
927 ++q;
929 *q = '\0';
931 return statBuffer;
934 UString& UString::operator=(Empty)
936 m_rep = &Rep::empty;
938 return *this;
941 UString& UString::operator=(const char* c)
943 set(c, c ? strlen(c) : 0);
945 return *this;
948 void UString::set(const char* c, int l)
950 if (!c) {
951 m_rep = &Rep::null;
952 return;
955 if (l == 0) {
956 m_rep = &Rep::empty;
957 return;
960 UChar *d;
961 if (m_rep->rc == 1 && l <= m_rep->capacity && m_rep->baseIsSelf() && m_rep->offset == 0 && m_rep->preCapacity == 0) {
962 d = m_rep->buf;
963 m_rep->_hash = 0;
964 m_rep->len = l;
965 } else {
966 d = allocChars(l);
967 if (!d) {
968 m_rep = &Rep::null;
969 return;
971 m_rep = Rep::create(d, l);
973 for (int i = 0; i < l; i++)
974 d[i].uc = static_cast<unsigned char>(c[i]);
977 bool UString::is8Bit() const
979 const UChar *u = data();
980 const UChar *limit = u + size();
981 while (u < limit) {
982 if (u->uc > 0xFF)
983 return false;
984 ++u;
987 return true;
990 const UChar UString::operator[](int pos) const
992 if (pos >= size())
993 return '\0';
994 return data()[pos];
997 double UString::toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const
999 double d;
1001 // FIXME: If tolerateTrailingJunk is true, then we want to tolerate non-8-bit junk
1002 // after the number, so is8Bit is too strict a check.
1003 if (!is8Bit())
1004 return NaN;
1006 const char *c = ascii();
1008 // skip leading white space
1009 while (isASCIISpace(*c))
1010 c++;
1012 // empty string ?
1013 if (*c == '\0')
1014 return tolerateEmptyString ? 0.0 : NaN;
1016 // hex number ?
1017 if (*c == '0' && (*(c+1) == 'x' || *(c+1) == 'X')) {
1018 const char* firstDigitPosition = c + 2;
1019 c++;
1020 d = 0.0;
1021 while (*(++c)) {
1022 if (*c >= '0' && *c <= '9')
1023 d = d * 16.0 + *c - '0';
1024 else if ((*c >= 'A' && *c <= 'F') || (*c >= 'a' && *c <= 'f'))
1025 d = d * 16.0 + (*c & 0xdf) - 'A' + 10.0;
1026 else
1027 break;
1030 if (d >= mantissaOverflowLowerBound)
1031 d = parseIntOverflow(firstDigitPosition, c - firstDigitPosition, 16);
1032 } else {
1033 // regular number ?
1034 char *end;
1035 d = kjs_strtod(c, &end);
1036 if ((d != 0.0 || end != c) && d != Inf && d != -Inf) {
1037 c = end;
1038 } else {
1039 double sign = 1.0;
1041 if (*c == '+')
1042 c++;
1043 else if (*c == '-') {
1044 sign = -1.0;
1045 c++;
1048 // We used strtod() to do the conversion. However, strtod() handles
1049 // infinite values slightly differently than JavaScript in that it
1050 // converts the string "inf" with any capitalization to infinity,
1051 // whereas the ECMA spec requires that it be converted to NaN.
1053 if (strncmp(c, "Infinity", 8) == 0) {
1054 d = sign * Inf;
1055 c += 8;
1056 } else if ((d == Inf || d == -Inf) && *c != 'I' && *c != 'i')
1057 c = end;
1058 else
1059 return NaN;
1063 // allow trailing white space
1064 while (isASCIISpace(*c))
1065 c++;
1066 // don't allow anything after - unless tolerant=true
1067 if (!tolerateTrailingJunk && *c != '\0')
1068 d = NaN;
1070 return d;
1073 double UString::toDouble(bool tolerateTrailingJunk) const
1075 return toDouble(tolerateTrailingJunk, true);
1078 double UString::toDouble() const
1080 return toDouble(false, true);
1083 uint32_t UString::toUInt32(bool *ok) const
1085 double d = toDouble();
1086 bool b = true;
1088 if (d != static_cast<uint32_t>(d)) {
1089 b = false;
1090 d = 0;
1093 if (ok)
1094 *ok = b;
1096 return static_cast<uint32_t>(d);
1099 uint32_t UString::toUInt32(bool *ok, bool tolerateEmptyString) const
1101 double d = toDouble(false, tolerateEmptyString);
1102 bool b = true;
1104 if (d != static_cast<uint32_t>(d)) {
1105 b = false;
1106 d = 0;
1109 if (ok)
1110 *ok = b;
1112 return static_cast<uint32_t>(d);
1115 uint32_t UString::toStrictUInt32(bool *ok) const
1117 if (ok)
1118 *ok = false;
1120 // Empty string is not OK.
1121 int len = m_rep->len;
1122 if (len == 0)
1123 return 0;
1124 const UChar *p = m_rep->data();
1125 unsigned short c = p->unicode();
1127 // If the first digit is 0, only 0 itself is OK.
1128 if (c == '0') {
1129 if (len == 1 && ok)
1130 *ok = true;
1131 return 0;
1134 // Convert to UInt32, checking for overflow.
1135 uint32_t i = 0;
1136 while (1) {
1137 // Process character, turning it into a digit.
1138 if (c < '0' || c > '9')
1139 return 0;
1140 const unsigned d = c - '0';
1142 // Multiply by 10, checking for overflow out of 32 bits.
1143 if (i > 0xFFFFFFFFU / 10)
1144 return 0;
1145 i *= 10;
1147 // Add in the digit, checking for overflow out of 32 bits.
1148 const unsigned max = 0xFFFFFFFFU - d;
1149 if (i > max)
1150 return 0;
1151 i += d;
1153 // Handle end of string.
1154 if (--len == 0) {
1155 if (ok)
1156 *ok = true;
1157 return i;
1160 // Get next character.
1161 c = (++p)->unicode();
1165 int UString::find(const UString &f, int pos) const
1167 int sz = size();
1168 int fsz = f.size();
1169 if (sz < fsz)
1170 return -1;
1171 if (pos < 0)
1172 pos = 0;
1173 if (fsz == 0)
1174 return pos;
1175 const UChar* data_ = data();
1176 const UChar* end = data_ + sz - fsz;
1177 int fsizeminusone = (fsz - 1) * sizeof(UChar);
1178 const UChar *fdata = f.data();
1179 unsigned short fchar = fdata->uc;
1180 ++fdata;
1181 for (const UChar* c = data_ + pos; c <= end; c++)
1182 if (c->uc == fchar && !memcmp(c + 1, fdata, fsizeminusone))
1183 return (c - data_);
1185 return -1;
1188 int UString::find(UChar ch, int pos) const
1190 if (pos < 0)
1191 pos = 0;
1192 const UChar* data_ = data();
1193 const UChar *end = data_ + size();
1194 for (const UChar *c = data_ + pos; c < end; c++)
1195 if (*c == ch)
1196 return (c - data_);
1198 return -1;
1201 int UString::rfind(const UString &f, int pos) const
1203 int sz = size();
1204 int fsz = f.size();
1205 if (sz < fsz)
1206 return -1;
1207 if (pos < 0)
1208 pos = 0;
1209 if (pos > sz - fsz)
1210 pos = sz - fsz;
1211 if (fsz == 0)
1212 return pos;
1213 int fsizeminusone = (fsz - 1) * sizeof(UChar);
1214 const UChar *fdata = f.data();
1215 const UChar* data_ = data();
1216 for (const UChar* c = data_ + pos; c >= data_; c--) {
1217 if (*c == *fdata && !memcmp(c + 1, fdata + 1, fsizeminusone))
1218 return (c - data_);
1221 return -1;
1224 int UString::rfind(UChar ch, int pos) const
1226 if (isEmpty())
1227 return -1;
1228 if (pos + 1 >= size())
1229 pos = size() - 1;
1230 const UChar* data_ = data();
1231 for (const UChar* c = data_ + pos; c >= data_; c--) {
1232 if (*c == ch)
1233 return (c - data_);
1236 return -1;
1239 UString UString::substr(int pos, int len) const
1241 int s = size();
1243 if (pos < 0)
1244 pos = 0;
1245 else if (pos >= s)
1246 pos = s;
1247 if (len < 0)
1248 len = s;
1249 if (pos + len >= s)
1250 len = s - pos;
1252 if (pos == 0 && len == s)
1253 return *this;
1255 return UString(Rep::create(m_rep, pos, len));
1258 void UString::copyForWriting()
1260 int l = size();
1261 if (!l) return; // Not going to touch anything anyway.
1262 if (m_rep->rc > 1 || !m_rep->baseIsSelf()) {
1263 UChar* n = allocChars(l);
1264 memcpy(n, data(), l * sizeof(UChar));
1265 m_rep = Rep::create(n, l);
1269 bool operator==(const UString& s1, const UString& s2)
1271 #if 0
1272 if (s1.m_rep == s2.m_rep)
1273 return true;
1274 #endif
1276 if (s1.m_rep->len != s2.m_rep->len)
1277 return false;
1279 return (memcmp(s1.m_rep->data(), s2.m_rep->data(),
1280 s1.m_rep->len * sizeof(UChar)) == 0);
1283 bool operator==(const UString& s1, const char *s2)
1285 if (s2 == 0) {
1286 return s1.isEmpty();
1289 const UChar *u = s1.data();
1290 const UChar *uend = u + s1.size();
1291 while (u != uend && *s2) {
1292 if (u->uc != (unsigned char)*s2)
1293 return false;
1294 s2++;
1295 u++;
1298 return u == uend && *s2 == 0;
1301 bool operator<(const UString& s1, const UString& s2)
1303 const int l1 = s1.size();
1304 const int l2 = s2.size();
1305 const int lmin = l1 < l2 ? l1 : l2;
1306 const UChar *c1 = s1.data();
1307 const UChar *c2 = s2.data();
1308 int l = 0;
1309 while (l < lmin && *c1 == *c2) {
1310 c1++;
1311 c2++;
1312 l++;
1314 if (l < lmin)
1315 return (c1->uc < c2->uc);
1317 return (l1 < l2);
1320 int compare(const UString& s1, const UString& s2)
1322 const int l1 = s1.size();
1323 const int l2 = s2.size();
1324 const int lmin = l1 < l2 ? l1 : l2;
1325 const UChar *c1 = s1.data();
1326 const UChar *c2 = s2.data();
1327 int l = 0;
1328 while (l < lmin && *c1 == *c2) {
1329 c1++;
1330 c2++;
1331 l++;
1334 if (l < lmin)
1335 return (c1->uc > c2->uc) ? 1 : -1;
1337 if (l1 == l2)
1338 return 0;
1340 return (l1 > l2) ? 1 : -1;
// Given the first byte of a non-ASCII UTF-8 sequence, returns the length of
// the sequence it begins (2, 3 or 4). Returns 0 for any byte that is not a
// lead byte of such a sequence (ASCII, continuation bytes, 0xF8 and above).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
  const int lead = b0 & 0xFF;
  if (lead < 0xC0)
    return 0; // top two bits are not both set
  if (lead < 0xE0)
    return 2; // 110xxxxx
  if (lead < 0xF0)
    return 3; // 1110xxxx
  if (lead < 0xF8)
    return 4; // 11110xxx
  return 0;
}
1356 int UTF8SequenceLengthNonASCII(char b0)
1358 return inlineUTF8SequenceLengthNonASCII(b0);
1361 inline int inlineUTF8SequenceLength(char b0)
1363 return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
1366 // Given a first byte, gives the length of the UTF-8 sequence it begins.
1367 // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
1368 // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
1369 int UTF8SequenceLength(char b0)
1371 return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
1374 // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
1375 // Only allows Unicode characters (U-00000000 to U-0010FFFF).
1376 // Returns -1 if the sequence is not valid (including presence of extra bytes).
1377 int decodeUTF8Sequence(const char *sequence)
1379 // Handle 0-byte sequences (never valid).
1380 const unsigned char b0 = sequence[0];
1381 const int length = inlineUTF8SequenceLength(b0);
1382 if (length == 0)
1383 return -1;
1385 // Handle 1-byte sequences (plain ASCII).
1386 const unsigned char b1 = sequence[1];
1387 if (length == 1) {
1388 if (b1)
1389 return -1;
1390 return b0;
1393 // Handle 2-byte sequences.
1394 if ((b1 & 0xC0) != 0x80)
1395 return -1;
1396 const unsigned char b2 = sequence[2];
1397 if (length == 2) {
1398 if (b2)
1399 return -1;
1400 const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
1401 if (c < 0x80)
1402 return -1;
1403 return c;
1406 // Handle 3-byte sequences.
1407 if ((b2 & 0xC0) != 0x80)
1408 return -1;
1409 const unsigned char b3 = sequence[3];
1410 if (length == 3) {
1411 if (b3)
1412 return -1;
1413 const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
1414 if (c < 0x800)
1415 return -1;
1416 // UTF-16 surrogates should never appear in UTF-8 data.
1417 if (c >= 0xD800 && c <= 0xDFFF)
1418 return -1;
1419 // Backwards BOM and U+FFFF should never appear in UTF-8 data.
1420 if (c == 0xFFFE || c == 0xFFFF)
1421 return -1;
1422 return c;
1425 // Handle 4-byte sequences.
1426 if ((b3 & 0xC0) != 0x80)
1427 return -1;
1428 const unsigned char b4 = sequence[4];
1429 if (length == 4) {
1430 if (b4)
1431 return -1;
1432 const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
1433 if (c < 0x10000 || c > 0x10FFFF)
1434 return -1;
1435 return c;
1438 return -1;
1441 CString UString::UTF8String() const
1443 // Allocate a buffer big enough to hold all the characters.
1444 const int length = size();
1445 Vector<char, 1024> buffer(length * 3);
1447 // Convert to runs of 8-bit characters.
1448 char *p = buffer.begin();
1449 const unsigned short* d = &data()->uc;
1450 for (int i = 0; i != length; ++i) {
1451 unsigned int c = d[i], sc;
1452 if (c < 0x80) {
1453 *p++ = (char)c;
1454 } else if (c < 0x800) {
1455 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
1456 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
1457 } else if (c >= 0xD800 && c <= 0xDBFF && (i+1) < length &&
1458 (sc = d[i+1]) >= 0xDC00 && sc <= 0xDFFF) {
1459 sc = 0x10000 + (((c & 0x3FF) << 10) | (sc & 0x3FF));
1460 *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
1461 *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
1462 *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1463 *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
1464 ++i;
1465 } else {
1466 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
1467 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1468 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
1472 // Return the result as a C string.
1473 CString result(buffer.data(), p - buffer.data());
1475 return result;
1478 } // namespace KJS