kjs/ustring.cpp

   1 // -*- c-basic-offset: 2 -*-
   2 /*
   3  *  This file is part of the KDE libraries
   4  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   5  *  Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.
   6  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
   7  *
   8  *  This library is free software; you can redistribute it and/or
   9  *  modify it under the terms of the GNU Library General Public
  10  *  License as published by the Free Software Foundation; either
  11  *  version 2 of the License, or (at your option) any later version.
  12  *
  13  *  This library is distributed in the hope that it will be useful,
  14  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  *  Library General Public License for more details.
  17  *
  18  *  You should have received a copy of the GNU Library General Public License
  19  *  along with this library; see the file COPYING.LIB.  If not, write to
  20  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  21  *  Boston, MA 02110-1301, USA.
  22  *
  23  */
  24
  25 #include "ustring.h"
  26 #include <config.h>
  27
  28 #include <assert.h>
  29 #include <stdlib.h>
  30 #include <stdio.h>
  31 #include "wtf/DisallowCType.h"
  32 #include "wtf/ASCIICType.h"
  33 #if HAVE(STRING_H)
  34 #include <string.h>
  35 #endif
  36 #if HAVE(STRINGS_H)
  37 #include <strings.h>
  38 #endif
  39 #include <limits.h>
  40
  41 #include "operations.h"
  42 #include "function.h"
  43 #include "identifier.h"
  44 #include <math.h>
  45 #include "dtoa.h"
  46 #include "collector.h"
  47
  48 #include <wtf/Vector.h>
  49
  50 using std::max;
  51
  52 // GCC cstring uses these automatically, but not all implementations do.
  53 using std::strlen;
  54 using std::strcpy;
  55 using std::strncpy;
  56 using std::memset;
  57 using std::memcpy;
  58
  59 using namespace WTF;
  60
  61 namespace KJS {
  62
  63 extern const double NaN;
  64 extern const double Inf;
  65
  66 static inline size_t overflowIndicator() { return std::numeric_limits<size_t>::max(); }
  67 static inline size_t maxUChars() { return std::numeric_limits<size_t>::max() / sizeof(UChar); }
  68
  69 static inline UChar* allocChars(size_t length)
  70 {
  71     assert(length);
  72     if (length > maxUChars())
  73         return 0;
  74     return static_cast<UChar*>(fastMalloc(sizeof(UChar) * length));
  75 }
  76
  77 static inline UChar* reallocChars(UChar* buffer, size_t length)
  78 {
  79     ASSERT(length);
  80     if (length > maxUChars())
  81         return 0;
  82     return static_cast<UChar*>(fastRealloc(buffer, sizeof(UChar) * length));
  83 }
  84
  85 CString::CString(const char *c)
  86 {
  87   length = strlen(c);
  88   data = new char[length+1];
  89   memcpy(data, c, length + 1);
  90 }
  91
  92 CString::CString(const char *c, size_t len)
  93 {
  94   length = len;
  95   data = new char[len+1];
  96   memcpy(data, c, len);
  97   data[len] = 0;
  98 }
  99
 100 CString::CString(const CString &b)
 101 {
 102   length = b.length;
 103   if (length > 0 && b.data) {
 104     data = new char[length+1];
 105     memcpy(data, b.data, length + 1);
 106   }
 107   else
 108     data = 0;
 109 }
 110
 111 CString::~CString()
 112 {
 113   delete [] data;
 114 }
 115
 116 CString &CString::operator=(const char *c)
 117 {
 118   if (data)
 119     delete [] data;
 120   length = strlen(c);
 121   data = new char[length+1];
 122   memcpy(data, c, length + 1);
 123
 124   return *this;
 125 }
 126
 127 CString &CString::operator=(const CString &str)
 128 {
 129   if (this == &str)
 130     return *this;
 131
 132   if (data)
 133     delete [] data;
 134   length = str.length;
 135   if (str.data) {
 136     data = new char[length + 1];
 137     memcpy(data, str.data, length + 1);
 138   }
 139   else
 140     data = 0;
 141
 142   return *this;
 143 }
 144
 145 bool operator==(const CString& c1, const CString& c2)
 146 {
 147   size_t len = c1.size();
 148   return len == c2.size() && (len == 0 || memcmp(c1.c_str(), c2.c_str(), len) == 0);
 149 }
 150
     // Statically initialized shared Reps and the scratch buffer used by UString::ascii().
     //
     // Hack here to avoid a global with a constructor; point to an unsigned short instead of a UChar.
 151 // Hack here to avoid a global with a constructor; point to an unsigned short instead of a UChar.
 152 static unsigned short almostUChar;
     // NOTE(review): the twelve initializers presumably follow the member order that
     // Rep::create() assigns (offset, len, rc, _hash, isIdentifier, baseString,
     // reportedCost, buf, usedCapacity, capacity, usedPreCapacity, preCapacity) --
     // confirm against the Rep declaration in ustring.h.
     // Both Reps refer to themselves as base string; only `empty` has a non-null buffer
     // pointer (the address of almostUChar).
 153 UString::Rep UString::Rep::null = { 0, 0, 1, 0, 0, &UString::Rep::null, 0, 0, 0, 0, 0, 0 };
 154 UString::Rep UString::Rep::empty = { 0, 0, 1, 0, 0, &UString::Rep::empty, 0, reinterpret_cast<UChar*>(&almostUChar), 0, 0, 0, 0 };
     // Minimum size of the shared buffer that ascii() returns a pointer into.
 155 const int normalStatBufferSize = 4096;
 156 static char *statBuffer = 0; // FIXME: This buffer is never deallocated.
     // Current allocated size of statBuffer in bytes (0 until ascii() first allocates it).
 157 static int statBufferSize = 0;
 158
 159 PassRefPtr<UString::Rep> UString::Rep::createCopying (const UChar* d, int length)
 160 {
 161   UChar* copyD = allocChars(length);
 162   memcpy(copyD, d, length * sizeof(UChar));
 163
 164   return create(copyD, length);
 165 }
 166
 167 PassRefPtr<UString::Rep> UString::Rep::create(UChar *d, int l)
 168 {
 169   Rep* r = new Rep;
 170   r->offset = 0;
 171   r->len = l;
 172   r->rc = 1;
 173   r->_hash = 0;
 174   r->isIdentifier = 0;
 175   r->baseString = r;
 176   r->reportedCost = 0;
 177   r->buf = d;
 178   r->usedCapacity = l;
 179   r->capacity = l;
 180   r->usedPreCapacity = 0;
 181   r->preCapacity = 0;
 182
 183   // steal the single reference this Rep was created with
 184   return adoptRef(r);
 185 }
 186
 187 PassRefPtr<UString::Rep> UString::Rep::create(PassRefPtr<Rep> base, int offset, int length)
 188 {
 189   assert(base);
 190
 191   int baseOffset = base->offset;
 192
 193   base = base->baseString;
 194
 195   assert(-(offset + baseOffset) <= base->usedPreCapacity);
 196   assert(offset + baseOffset + length <= base->usedCapacity);
 197
 198   Rep* r = new Rep;
 199   r->offset = baseOffset + offset;
 200   r->len = length;
 201   r->rc = 1;
 202   r->_hash = 0;
 203   r->isIdentifier = 0;
 204   r->baseString = base.releaseRef();
 205   r->reportedCost = 0;
 206   r->buf = 0;
 207   r->usedCapacity = 0;
 208   r->capacity = 0;
 209   r->usedPreCapacity = 0;
 210   r->preCapacity = 0;
 211
 212   // steal the single reference this Rep was created with
 213   return adoptRef(r);
 214 }
 215
 216 void UString::Rep::destroy()
 217 {
 218   if (isIdentifier)
 219     Identifier::remove(this);
 220   if (baseString != this) {
 221     baseString->deref();
 222   } else {
 223     fastFree(buf);
 224   }
 225   delete this;
 226 }
 227
     // Initial value of the running hash in the computeHash() overloads below.
 228 // Golden ratio - arbitrary start value to avoid mapping all 0's to all 0's
 229 // or anything like that.
 230 const unsigned PHI = 0x9e3779b9U;
 231
 232 // Paul Hsieh's SuperFastHash
 233 // http://www.azillionmonkeys.com/qed/hash.html
 234 unsigned UString::Rep::computeHash(const UChar *s, int len)
 235 {
 236   unsigned l = len;
 237   uint32_t hash = PHI;
 238   uint32_t tmp;
 239
 240   int rem = l & 1;
 241   l >>= 1;
 242
 243   // Main loop
 244   for (; l > 0; l--) {
 245     hash += s[0].uc;
 246     tmp = (s[1].uc << 11) ^ hash;
 247     hash = (hash << 16) ^ tmp;
 248     s += 2;
 249     hash += hash >> 11;
 250   }
 251
 252   // Handle end case
 253   if (rem) {
 254     hash += s[0].uc;
 255     hash ^= hash << 11;
 256     hash += hash >> 17;
 257   }
 258
 259   // Force "avalanching" of final 127 bits
 260   hash ^= hash << 3;
 261   hash += hash >> 5;
 262   hash ^= hash << 2;
 263   hash += hash >> 15;
 264   hash ^= hash << 10;
 265
 266   // this avoids ever returning a hash code of 0, since that is used to
 267   // signal "hash not computed yet", using a value that is likely to be
 268   // effectively the same as 0 when the low bits are masked
 269   if (hash == 0)
 270     hash = 0x80000000;
 271
 272   return hash;
 273 }
 274
 275 // Paul Hsieh's SuperFastHash
 276 // http://www.azillionmonkeys.com/qed/hash.html
 277 unsigned UString::Rep::computeHash(const char* s, int len)
 278 {
 279   // This hash is designed to work on 16-bit chunks at a time. But since the normal case
 280   // (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
 281   // were 16-bit chunks, which should give matching results
 282
 283   uint32_t hash = PHI;
 284   uint32_t tmp;
 285   unsigned l = len;
 286
 287   int rem = l & 1;
 288   l >>= 1;
 289
 290   // Main loop
 291   for (; l > 0; l--) {
 292     hash += (unsigned char)s[0];
 293     tmp = ((unsigned char)s[1] << 11) ^ hash;
 294     hash = (hash << 16) ^ tmp;
 295     s += 2;
 296     hash += hash >> 11;
 297   }
 298
 299   // Handle end case
 300   if (rem) {
 301     hash += (unsigned char)s[0];
 302     hash ^= hash << 11;
 303     hash += hash >> 17;
 304   }
 305
 306   // Force "avalanching" of final 127 bits
 307   hash ^= hash << 3;
 308   hash += hash >> 5;
 309   hash ^= hash << 2;
 310   hash += hash >> 15;
 311   hash ^= hash << 10;
 312
 313   // this avoids ever returning a hash code of 0, since that is used to
 314   // signal "hash not computed yet", using a value that is likely to be
 315   // effectively the same as 0 when the low bits are masked
 316   if (hash == 0)
 317     hash = 0x80000000;
 318
 319   return hash;
 320 }
 321
 322 unsigned UString::Rep::computeHash(const char* s)
 323 {
 324     return computeHash(s, strlen(s));
 325 }
 326
 327 // put these early so they can be inlined
 328 inline size_t UString::expandedSize(size_t size, size_t otherSize) const
 329 {
 330     // Do the size calculation in two parts, returning overflowIndicator if
 331     // we overflow the maximum value that we can handle.
 332
 333     if (size > maxUChars())
 334         return overflowIndicator();
 335
 336     size_t expandedSize = ((size + 10) / 10 * 11) + 1;
 337     if (maxUChars() - expandedSize < otherSize)
 338         return overflowIndicator();
 339
 340     return expandedSize + otherSize;
 341 }
 342
 343 inline int UString::usedCapacity() const
 344 {
 345   return m_rep->baseString->usedCapacity;
 346 }
 347
 348 inline int UString::usedPreCapacity() const
 349 {
 350   return m_rep->baseString->usedPreCapacity;
 351 }
 352
 353 void UString::expandCapacity(int requiredLength)
 354 {
 355   Rep* r = m_rep->baseString;
 356
 357   if (requiredLength > r->capacity) {
 358     size_t newCapacity = expandedSize(requiredLength, r->preCapacity);
 359     UChar* oldBuf = r->buf;
 360     r->buf = reallocChars(r->buf, newCapacity);
 361     if (!r->buf) {
 362         r->buf = oldBuf;
 363         m_rep = &Rep::null;
 364         return;
 365     }
 366     r->capacity = newCapacity - r->preCapacity;
 367   }
 368   if (requiredLength > r->usedCapacity) {
 369     r->usedCapacity = requiredLength;
 370   }
 371 }
 372
 373 void UString::expandPreCapacity(int requiredPreCap)
 374 {
 375   Rep* r = m_rep->baseString;
 376
 377   if (requiredPreCap > r->preCapacity) {
 378     size_t newCapacity = expandedSize(requiredPreCap, r->capacity);
 379     int delta = newCapacity - r->capacity - r->preCapacity;
 380
 381     UChar* newBuf = allocChars(newCapacity);
 382     if (!newBuf) {
 383       m_rep = &Rep::null;
 384       return;
 385     }
 386     memcpy(newBuf + delta, r->buf, (r->capacity + r->preCapacity) * sizeof(UChar));
 387     fastFree(r->buf);
 388     r->buf = newBuf;
 389
 390     r->preCapacity = newCapacity - r->capacity;
 391   }
 392   if (requiredPreCap > r->usedPreCapacity) {
 393     r->usedPreCapacity = requiredPreCap;
 394   }
 395 }
 396
 397
 398 UString::UString(Empty)
 399   : m_rep(&Rep::empty)
 400 {
 401 }
 402
 403 UString::UString(char c)
 404   : m_rep(Rep::create(allocChars(1), 1))
 405 {
 406     m_rep->buf[0] = static_cast<unsigned char>(c);
 407 }
 408
 409 UString::UString(const char* c)
 410 {
 411   if (!c) {
 412     m_rep = &Rep::null;
 413     return;
 414   }
 415
 416   if (!c[0]) {
 417     m_rep = &Rep::empty;
 418     return;
 419   }
 420
 421   size_t length = strlen(c);
 422   UChar *d = allocChars(length);
 423   if (!d)
 424     m_rep = &Rep::null;
 425   else {
 426     for (size_t i = 0; i < length; i++)
 427       d[i].uc = c[i];
 428     m_rep = Rep::create(d, static_cast<int>(length));
 429   }
 430 }
 431
 432 UString::UString(const char* c, size_t length)
 433 {
 434   if (!c) {
 435     m_rep = &Rep::null;
 436     return;
 437   }
 438
 439   if (length == 0) {
 440     m_rep = &Rep::empty;
 441     return;
 442   }
 443
 444   UChar* d = allocChars(length);
 445   if (!d)
 446       m_rep = &Rep::null;
 447   else {
 448     for (size_t i = 0; i < length; i++)
 449       d[i].uc = c[i];
 450     m_rep = Rep::create(d, static_cast<int>(length));
 451   }
 452 }
 453
 454 UString::UString(const UChar* c, int length)
 455 {
 456   if (length == 0)
 457     m_rep = &Rep::empty;
 458   else
 459     m_rep = Rep::createCopying(c, length);
 460 }
 461
 462 UString::UString(UChar* c, int length, bool copy)
 463 {
 464   if (length == 0)
 465     m_rep = &Rep::empty;
 466   else if (copy)
 467     m_rep = Rep::createCopying(c, length);
 468   else
 469     m_rep = Rep::create(c, length);
 470 }
 471
 472 UString::UString(const Vector<UChar>& buffer)
 473 {
 474     if (!buffer.size())
 475         m_rep = &Rep::empty;
 476     else
 477         m_rep = Rep::createCopying(buffer.data(), buffer.size());
 478 }
 479
 480
     // Concatenating constructor: the new string is a followed by b.
     //
     // Depending on the operands this either shares one operand's Rep unchanged
     // (other operand empty), writes b's characters after a in a's base buffer,
     // writes a's characters in front of b in b's base buffer, or allocates a
     // fresh buffer and copies both. When a buffer cannot be obtained the result
     // is the null Rep.
 481 UString::UString(const UString &a, const UString &b)
 482 {
 483   int aSize = a.size();
 484   int aOffset = a.m_rep->offset;
 485   int bSize = b.size();
 486   int bOffset = b.m_rep->offset;
 487   int length = aSize + bSize;
 488
 489   // possible cases:
 490
 491   if (aSize == 0) {
 492     // a is empty
 493     m_rep = b.m_rep;
 494   } else if (bSize == 0) {
 495     // b is empty
 496     m_rep = a.m_rep;
 497   } else if (aOffset + aSize == a.usedCapacity() && aSize >= minShareSize && 4 * aSize >= bSize &&
 498              (-bOffset != b.usedPreCapacity() || aSize >= bSize)) {
 499     // - a reaches the end of its buffer so it qualifies for shared append
 500     // - also, it's at least a quarter the length of b - appending to a much shorter
 501     //   string does more harm than good
 502     // - however, if b qualifies for prepend and is longer than a, we'd rather prepend
     // x shares a's Rep, so growing x grows the base buffer that a also uses;
     // x itself becomes null if the growth fails.
 503     UString x(a);
 504     x.expandCapacity(aOffset + length);
 505     if (a.data() && x.data()) {
 506         memcpy(const_cast<UChar *>(a.data() + aSize), b.data(), bSize * sizeof(UChar));
     // The result is a view starting where a starts and covering both parts.
 507         m_rep = Rep::create(a.m_rep, 0, length);
 508     } else
 509         m_rep = &Rep::null;
 510   } else if (-bOffset == b.usedPreCapacity() && bSize >= minShareSize && 4 * bSize >= aSize) {
 511     // - b reaches the beginning of its buffer so it qualifies for shared prepend
 512     // - also, it's at least a quarter the length of a - prepending to a much shorter
 513     //   string does more harm than good
     // y shares b's Rep, so growing y's pre-capacity grows the base buffer that
     // b also uses; y itself becomes null if the growth fails.
 514     UString y(b);
 515     y.expandPreCapacity(-bOffset + aSize);
 516     if (b.data() && y.data()) {
 517         memcpy(const_cast<UChar *>(b.data() - aSize), a.data(), aSize * sizeof(UChar));
     // The result is a view starting aSize characters before b.
 518         m_rep = Rep::create(b.m_rep, -aSize, length);
 519     } else
 520         m_rep = &Rep::null;
 521   } else {
 522     // a does not qualify for append, and b does not qualify for prepend, gotta make a whole new string
 523     size_t newCapacity = expandedSize(length, 0);
 524     UChar* d = allocChars(newCapacity);
 525     if (!d)
 526         m_rep = &Rep::null;
 527     else {
 528         memcpy(d, a.data(), aSize * sizeof(UChar));
 529         memcpy(d + aSize, b.data(), bSize * sizeof(UChar));
 530         m_rep = Rep::create(d, length);
     // Rep::create set capacity to `length`; record the larger allocation.
 531         m_rep->capacity = newCapacity;
 532     }
 533   }
 534 }
 535
 536 const UString &UString::null()
 537 {
 538   static UString* n = new UString;
 539   return *n;
 540 }
 541
 542 UString UString::from(int i)
 543 {
 544   UChar buf[1 + sizeof(i) * 3];
 545   UChar *end = buf + sizeof(buf) / sizeof(UChar);
 546   UChar *p = end;
 547
 548   if (i == 0) {
 549     *--p = '0';
 550   } else if (i == INT_MIN) {
 551     char minBuf[1 + sizeof(i) * 3];
 552     sprintf(minBuf, "%d", INT_MIN);
 553     return UString(minBuf);
 554   } else {
 555     bool negative = false;
 556     if (i < 0) {
 557       negative = true;
 558       i = -i;
 559     }
 560     while (i) {
 561       *--p = (unsigned short)((i % 10) + '0');
 562       i /= 10;
 563     }
 564     if (negative) {
 565       *--p = '-';
 566     }
 567   }
 568
 569   return UString(p, static_cast<int>(end - p));
 570 }
 571
 572 UString UString::from(unsigned int u)
 573 {
 574   UChar buf[sizeof(u) * 3];
 575   UChar *end = buf + sizeof(buf) / sizeof(UChar);
 576   UChar *p = end;
 577
 578   if (u == 0) {
 579     *--p = '0';
 580   } else {
 581     while (u) {
 582       *--p = (unsigned short)((u % 10) + '0');
 583       u /= 10;
 584     }
 585   }
 586
 587   return UString(p, static_cast<int>(end - p));
 588 }
 589
 590 UString UString::from(long l)
 591 {
 592   UChar buf[1 + sizeof(l) * 3];
 593   UChar *end = buf + sizeof(buf) / sizeof(UChar);
 594   UChar *p = end;
 595
 596   if (l == 0) {
 597     *--p = '0';
 598   } else if (l == LONG_MIN) {
 599     char minBuf[1 + sizeof(l) * 3];
 600     sprintf(minBuf, "%ld", LONG_MIN);
 601     return UString(minBuf);
 602   } else {
 603     bool negative = false;
 604     if (l < 0) {
 605       negative = true;
 606       l = -l;
 607     }
 608     while (l) {
 609       *--p = (unsigned short)((l % 10) + '0');
 610       l /= 10;
 611     }
 612     if (negative) {
 613       *--p = '-';
 614     }
 615   }
 616
 617   return UString(p, static_cast<int>(end - p));
 618 }
 619
     // Formats d as text.
     //
     // NaN always yields "NaN". Otherwise kjs_dtoa supplies the digit string
     // (`result`), the position of the decimal point relative to its first digit
     // (`decimalPoint`) and a sign flag, and one of four layouts is chosen:
     //   -6 < decimalPoint <= 0   ->  0.000ddd
     //    0 < decimalPoint <= 21  ->  ddd000 or ddd.ddd
     //   result is not digits     ->  copied verbatim
     //   anything else            ->  d.ddde+xx / d.ddde-xx
     // The text is assembled in a local char buffer and converted with the
     // explicit-length UString constructor, so no terminator is required.
 620 UString UString::from(double d)
 621 {
 622   // avoid ever printing -NaN, in JS conceptually there is only one NaN value
 623   if (isNaN(d))
 624     return UString("NaN", 3);
 625
 626   char buf[80];
 627   int decimalPoint;
 628   int sign;
 629
 630   char *result = kjs_dtoa(d, 0, 0, &decimalPoint, &sign, NULL);
 631   int length = static_cast<int>(strlen(result));
 632
     // i is the write position in buf and, at the end, the output length.
 633   int i = 0;
 634   if (sign) {
 635     buf[i++] = '-';
 636   }
 637
 638   if (decimalPoint <= 0 && decimalPoint > -6) {
     // Small magnitude: "0." followed by -decimalPoint zeros, then the digits.
 639     buf[i++] = '0';
 640     buf[i++] = '.';
 641     for (int j = decimalPoint; j < 0; j++) {
 642       buf[i++] = '0';
 643     }
 644     strcpy(buf + i, result);
 645     i += length;
 646   } else if (decimalPoint <= 21 && decimalPoint > 0) {
 647     if (length <= decimalPoint) {
     // All digits lie before the decimal point: pad with trailing zeros.
 648       strcpy(buf + i, result);
 649       i += length;
 650       for (int j = 0; j < decimalPoint - length; j++) {
 651         buf[i++] = '0';
 652       }
 653 //      buf[i] = '\0';
 654     } else {
     // Split the digits around the decimal point.
 655       strncpy(buf + i, result, decimalPoint);
 656       i += decimalPoint;
 657       buf[i++] = '.';
 658       strcpy(buf + i, result + decimalPoint);
 659       i += length - decimalPoint;
 660     }
 661   } else if (result[0] < '0' || result[0] > '9') {
     // Non-numeric text from kjs_dtoa is copied as-is.
     // NOTE(review): presumably this is the "Infinity" case -- confirm against dtoa.
 662     strcpy(buf + i, result);
 663     i += length;
 664   } else {
     // Exponent notation: first digit, optional fraction, then e+/-exponent.
 665     buf[i++] = result[0];
 666     if (length > 1) {
 667       buf[i++] = '.';
 668       strcpy(buf + i, result + 1);
 669       i += length - 1;
 670     }
 671
 672     buf[i++] = 'e';
 673     buf[i++] = (decimalPoint >= 0) ? '+' : '-';
 674     // decimalPoint can't be more than 3 digits decimal given the
 675     // nature of float representation
 676     int exponential = decimalPoint - 1;
 677     if (exponential < 0) {
 678       exponential = exponential * -1;
 679     }
     // Emit the exponent without leading zeros (one to three digits).
 680     if (exponential >= 100) {
 681       buf[i++] = '0' + exponential / 100;
 682     }
 683     if (exponential >= 10) {
 684       buf[i++] = '0' + (exponential % 100) / 10;
 685     }
 686     buf[i++] = '0' + exponential % 10;
 687 //    buf[i++] = '\0';
 688   }
 689
     // The digit string is released with kjs_freedtoa once it has been copied.
 690   kjs_freedtoa(result);
 691
 692   return UString(buf, i);
 693 }
 694
 695 UString UString::spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const
 696 {
 697   if (rangeCount == 1 && separatorCount == 0) {
 698     int thisSize = size();
 699     int position = substringRanges[0].position;
 700     int length = substringRanges[0].length;
 701     if (position <= 0 && length >= thisSize)
 702       return *this;
 703     return UString::Rep::create(m_rep, maxInt(0, position), minInt(thisSize, length));
 704   }
 705
 706   int totalLength = 0;
 707   for (int i = 0; i < rangeCount; i++)
 708     totalLength += substringRanges[i].length;
 709   for (int i = 0; i < separatorCount; i++)
 710     totalLength += separators[i].size();
 711
 712   if (totalLength == 0)
 713     return "";
 714
 715   UChar* buffer = allocChars(totalLength);
 716   if (!buffer)
 717       return null();
 718
 719   int maxCount = max(rangeCount, separatorCount);
 720   int bufferPos = 0;
 721   for (int i = 0; i < maxCount; i++) {
 722     if (i < rangeCount) {
 723       memcpy(buffer + bufferPos, data() + substringRanges[i].position, substringRanges[i].length * sizeof(UChar));
 724       bufferPos += substringRanges[i].length;
 725     }
 726     if (i < separatorCount) {
 727       memcpy(buffer + bufferPos, separators[i].data(), separators[i].size() * sizeof(UChar));
 728       bufferPos += separators[i].size();
 729     }
 730   }
 731
 732   return UString::Rep::create(buffer, totalLength);
 733 }
 734
 735 // Append a sub-string of <subStr> to this string.
 736 // Equivalent to append(subStr.substr(subPos, subLength))
 737
 738 UString& UString::append(const UString& subStr, int subPos, int subLength)
 739 {
 740   int subSize = subStr.size();
 741
 742   if (subPos < 0)
 743     subPos = 0;
 744   else if (subPos >= subSize)
 745     subPos = subSize;
 746   if (subLength < 0)
 747     subLength = subSize;
 748   if (subPos + subLength >= subSize)
 749     subLength = subSize - subPos;
 750
 751   return append(UString(subStr.data() + subPos, subLength));
 752 }
 753
 754 UString &UString::append(const UString &t)
 755 {
 756   int thisSize = size();
 757   int thisOffset = m_rep->offset;
 758   int tSize = t.size();
 759   int length = thisSize + tSize;
 760
 761   // possible cases:
 762   if (thisSize == 0) {
 763     // this is empty
 764     *this = t;
 765   } else if (tSize == 0) {
 766     // t is empty
 767   } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
 768     // this is direct and has refcount of 1 (so we can just alter it directly)
 769     expandCapacity(thisOffset + length);
 770     if (data()) {
 771         memcpy(const_cast<UChar*>(data() + thisSize), t.data(), tSize * sizeof(UChar));
 772         m_rep->len = length;
 773         m_rep->_hash = 0;
 774     }
 775   } else if (thisOffset + thisSize == usedCapacity() && thisSize >= minShareSize) {
 776     // this reaches the end of the buffer - extend it if it's long enough to append to
 777     expandCapacity(thisOffset + length);
 778     if (data()) {
 779         memcpy(const_cast<UChar*>(data() + thisSize), t.data(), tSize * sizeof(UChar));
 780         m_rep = Rep::create(m_rep, 0, length);
 781     }
 782   } else {
 783     // this is shared with someone using more capacity, gotta make a whole new string
 784     size_t newCapacity = expandedSize(length, 0);
 785     UChar* d = allocChars(newCapacity);
 786     if (!d)
 787         m_rep = &Rep::null;
 788     else {
 789         memcpy(d, data(), thisSize * sizeof(UChar));
 790         memcpy(const_cast<UChar*>(d + thisSize), t.data(), tSize * sizeof(UChar));
 791         m_rep = Rep::create(d, length);
 792         m_rep->capacity = newCapacity;
 793     }
 794   }
 795
 796   return *this;
 797 }
 798
 799
 800 UString &UString::append(const char *t)
 801 {
 802   int thisSize = size();
 803   int thisOffset = m_rep->offset;
 804   int tSize = static_cast<int>(strlen(t));
 805   int length = thisSize + tSize;
 806
 807   // possible cases:
 808   if (thisSize == 0) {
 809     // this is empty
 810     *this = t;
 811   } else if (tSize == 0) {
 812     // t is empty, we'll just return *this below.
 813   } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
 814     // this is direct and has refcount of 1 (so we can just alter it directly)
 815     expandCapacity(thisOffset + length);
 816     UChar *d = const_cast<UChar *>(data());
 817     if (d) {
 818         for (int i = 0; i < tSize; ++i)
 819             d[thisSize + i] = t[i];
 820         m_rep->len = length;
 821         m_rep->_hash = 0;
 822     }
 823   } else if (thisOffset + thisSize == usedCapacity() && thisSize >= minShareSize) {
 824     // this string reaches the end of the buffer - extend it
 825     expandCapacity(thisOffset + length);
 826     UChar *d = const_cast<UChar *>(data());
 827     if (d) {
 828         for (int i = 0; i < tSize; ++i)
 829             d[thisSize + i] = t[i];
 830         m_rep = Rep::create(m_rep, 0, length);
 831     }
 832   } else {
 833     // this is shared with someone using more capacity, gotta make a whole new string
 834     size_t newCapacity = expandedSize(length, 0);
 835     UChar* d = allocChars(newCapacity);
 836     if (!d)
 837         m_rep = &Rep::null;
 838     else {
 839         memcpy(d, data(), thisSize * sizeof(UChar));
 840         for (int i = 0; i < tSize; ++i)
 841             d[thisSize + i] = t[i];
 842         m_rep = Rep::create(d, length);
 843         m_rep->capacity = newCapacity;
 844     }
 845   }
 846
 847   return *this;
 848 }
 849
 850 UString &UString::append(unsigned short c)
 851 {
 852   int thisOffset = m_rep->offset;
 853   int length = size();
 854
 855   // possible cases:
 856   if (length == 0) {
 857     // this is empty - must make a new m_rep because we don't want to pollute the shared empty one
 858     size_t newCapacity = expandedSize(1, 0);
 859     UChar* d = allocChars(newCapacity);
 860     if (!d)
 861         m_rep = &Rep::null;
 862     else {
 863         d[0] = c;
 864         m_rep = Rep::create(d, 1);
 865         m_rep->capacity = newCapacity;
 866     }
 867   } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
 868     // this is direct and has refcount of 1 (so we can just alter it directly)
 869     expandCapacity(thisOffset + length + 1);
 870     UChar *d = const_cast<UChar *>(data());
 871     if (d) {
 872         d[length] = c;
 873         m_rep->len = length + 1;
 874         m_rep->_hash = 0;
 875     }
 876   } else if (thisOffset + length == usedCapacity() && length >= minShareSize) {
 877     // this reaches the end of the string - extend it and share
 878     expandCapacity(thisOffset + length + 1);
 879     UChar *d = const_cast<UChar *>(data());
 880     if (d) {
 881         d[length] = c;
 882         m_rep = Rep::create(m_rep, 0, length + 1);
 883     }
 884   } else {
 885     // this is shared with someone using more capacity, gotta make a whole new string
 886     size_t newCapacity = expandedSize(length + 1, 0);
 887     UChar* d = allocChars(newCapacity);
 888     if (!d)
 889         m_rep = &Rep::null;
 890     else {
 891         memcpy(d, data(), length * sizeof(UChar));
 892         d[length] = c;
 893         m_rep = Rep::create(d, length + 1);
 894         m_rep->capacity = newCapacity;
 895     }
 896   }
 897
 898   return *this;
 899 }
 900
 901 CString UString::cstring() const
 902 {
 903   return ascii();
 904 }
 905
 906 char *UString::ascii() const
 907 {
 908   // Never make the buffer smaller than normalStatBufferSize.
 909   // Thus we almost never need to reallocate.
 910   int length = size();
 911   int neededSize = length + 1;
 912   if (neededSize < normalStatBufferSize) {
 913     neededSize = normalStatBufferSize;
 914   }
 915   if (neededSize != statBufferSize) {
 916     delete [] statBuffer;
 917     statBuffer = new char [neededSize];
 918     statBufferSize = neededSize;
 919   }
 920
 921   const UChar *p = data();
 922   char *q = statBuffer;
 923   const UChar *limit = p + length;
 924   while (p != limit) {
 925     *q = static_cast<char>(p->uc);
 926     ++p;
 927     ++q;
 928   }
 929   *q = '\0';
 930
 931   return statBuffer;
 932 }
 933
 934 UString& UString::operator=(Empty)
 935 {
 936   m_rep = &Rep::empty;
 937
 938   return *this;
 939 }
 940
 941 UString& UString::operator=(const char* c)
 942 {
 943   set(c, c ? strlen(c) : 0);
 944
 945   return *this;
 946 }
 947
 948 void UString::set(const char* c, int l)
 949 {
 950   if (!c) {
 951     m_rep = &Rep::null;
 952     return;
 953   }
 954
 955   if (l == 0) {
 956     m_rep = &Rep::empty;
 957     return;
 958   }
 959
 960   UChar *d;
 961   if (m_rep->rc == 1 && l <= m_rep->capacity && m_rep->baseIsSelf() && m_rep->offset == 0 && m_rep->preCapacity == 0) {
 962     d = m_rep->buf;
 963     m_rep->_hash = 0;
 964     m_rep->len = l;
 965   } else {
 966     d = allocChars(l);
 967     if (!d) {
 968       m_rep = &Rep::null;
 969       return;
 970     }
 971     m_rep = Rep::create(d, l);
 972   }
 973   for (int i = 0; i < l; i++)
 974     d[i].uc = static_cast<unsigned char>(c[i]);
 975 }
 976
 977 bool UString::is8Bit() const
 978 {
 979   const UChar *u = data();
 980   const UChar *limit = u + size();
 981   while (u < limit) {
 982     if (u->uc > 0xFF)
 983       return false;
 984     ++u;
 985   }
 986
 987   return true;
 988 }
 989
 990 const UChar UString::operator[](int pos) const
 991 {
 992   if (pos >= size())
 993     return '\0';
 994   return data()[pos];
 995 }
 996
     // Parses this string as a number.
     //
     // Leading and trailing ASCII white space is skipped. Accepted forms are a
     // hexadecimal literal ("0x"/"0X" prefix), whatever kjs_strtod accepts, and an
     // optionally signed "Infinity". Returns NaN when the string contains a code
     // unit above 0xFF, when no number can be parsed, or -- unless
     // tolerateTrailingJunk is set -- when anything but white space follows the
     // number. A string that is empty after skipping white space yields 0.0 if
     // tolerateEmptyString is set and NaN otherwise.
 997 double UString::toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const
 998 {
 999   double d;
1000
1001   // FIXME: If tolerateTrailingJunk is true, then we want to tolerate non-8-bit junk
1002   // after the number, so is8Bit is too strict a check.
1003   if (!is8Bit())
1004     return NaN;
1005
     // c walks over the buffer returned by ascii().
1006   const char *c = ascii();
1007
1008   // skip leading white space
1009   while (isASCIISpace(*c))
1010     c++;
1011
1012   // empty string ?
1013   if (*c == '\0')
1014     return tolerateEmptyString ? 0.0 : NaN;
1015
1016   // hex number ?
1017   if (*c == '0' && (*(c+1) == 'x' || *(c+1) == 'X')) {
1018     const char* firstDigitPosition = c + 2;
     // Step onto the 'x'; the pre-increment in the loop then moves to the first digit.
1019     c++;
1020     d = 0.0;
1021     while (*(++c)) {
1022       if (*c >= '0' && *c <= '9')
1023         d = d * 16.0 + *c - '0';
1024       else if ((*c >= 'A' && *c <= 'F') || (*c >= 'a' && *c <= 'f'))
     // Masking with 0xdf folds lower-case letters onto upper-case ones.
1025         d = d * 16.0 + (*c & 0xdf) - 'A' + 10.0;
1026       else
1027         break;
1028     }
1029
     // Values at or above mantissaOverflowLowerBound are re-parsed from the
     // digits by parseIntOverflow instead of using the accumulated double.
1030     if (d >= mantissaOverflowLowerBound)
1031         d = parseIntOverflow(firstDigitPosition, c - firstDigitPosition, 16);
1032   } else {
1033     // regular number ?
1034     char *end;
1035     d = kjs_strtod(c, &end);
     // Accept the result if something was consumed (or it is non-zero) and it is finite.
1036     if ((d != 0.0 || end != c) && d != Inf && d != -Inf) {
1037       c = end;
1038     } else {
1039       double sign = 1.0;
1040
1041       if (*c == '+')
1042         c++;
1043       else if (*c == '-') {
1044         sign = -1.0;
1045         c++;
1046       }
1047
1048       // We used strtod() to do the conversion. However, strtod() handles
1049       // infinite values slightly differently than JavaScript in that it
1050       // converts the string "inf" with any capitalization to infinity,
1051       // whereas the ECMA spec requires that it be converted to NaN.
1052
1053       if (strncmp(c, "Infinity", 8) == 0) {
1054         d = sign * Inf;
1055         c += 8;
1056       } else if ((d == Inf || d == -Inf) && *c != 'I' && *c != 'i')
     // An infinite result that was not spelled "inf..." is kept as parsed.
1057         c = end;
1058       else
1059         return NaN;
1060     }
1061   }
1062
1063   // allow trailing white space
1064   while (isASCIISpace(*c))
1065     c++;
1066   // don't allow anything after - unless tolerant=true
1067   if (!tolerateTrailingJunk && *c != '\0')
1068     d = NaN;
1069
1070   return d;
1071 }
1072
1073 double UString::toDouble(bool tolerateTrailingJunk) const
1074 {
1075   return toDouble(tolerateTrailingJunk, true);
1076 }
1077
1078 double UString::toDouble() const
1079 {
1080   return toDouble(false, true);
1081 }
1082
1083 uint32_t UString::toUInt32(bool *ok) const
1084 {
1085   double d = toDouble();
1086   bool b = true;
1087
1088   if (d != static_cast<uint32_t>(d)) {
1089     b = false;
1090     d = 0;
1091   }
1092
1093   if (ok)
1094     *ok = b;
1095
1096   return static_cast<uint32_t>(d);
1097 }
1098
1099 uint32_t UString::toUInt32(bool *ok, bool tolerateEmptyString) const
1100 {
1101   double d = toDouble(false, tolerateEmptyString);
1102   bool b = true;
1103
1104   if (d != static_cast<uint32_t>(d)) {
1105     b = false;
1106     d = 0;
1107   }
1108
1109   if (ok)
1110     *ok = b;
1111
1112   return static_cast<uint32_t>(d);
1113 }
1114
1115 uint32_t UString::toStrictUInt32(bool *ok) const
1116 {
1117   if (ok)
1118     *ok = false;
1119
1120   // Empty string is not OK.
1121   int len = m_rep->len;
1122   if (len == 0)
1123     return 0;
1124   const UChar *p = m_rep->data();
1125   unsigned short c = p->unicode();
1126
1127   // If the first digit is 0, only 0 itself is OK.
1128   if (c == '0') {
1129     if (len == 1 && ok)
1130       *ok = true;
1131     return 0;
1132   }
1133
1134   // Convert to UInt32, checking for overflow.
1135   uint32_t i = 0;
1136   while (1) {
1137     // Process character, turning it into a digit.
1138     if (c < '0' || c > '9')
1139       return 0;
1140     const unsigned d = c - '0';
1141
1142     // Multiply by 10, checking for overflow out of 32 bits.
1143     if (i > 0xFFFFFFFFU / 10)
1144       return 0;
1145     i *= 10;
1146
1147     // Add in the digit, checking for overflow out of 32 bits.
1148     const unsigned max = 0xFFFFFFFFU - d;
1149     if (i > max)
1150         return 0;
1151     i += d;
1152
1153     // Handle end of string.
1154     if (--len == 0) {
1155       if (ok)
1156         *ok = true;
1157       return i;
1158     }
1159
1160     // Get next character.
1161     c = (++p)->unicode();
1162   }
1163 }
1164
1165 int UString::find(const UString &f, int pos) const
1166 {
1167   int sz = size();
1168   int fsz = f.size();
1169   if (sz < fsz)
1170     return -1;
1171   if (pos < 0)
1172     pos = 0;
1173   if (fsz == 0)
1174     return pos;
1175   const UChar* data_ = data();
1176   const UChar* end = data_ + sz - fsz;
1177   int fsizeminusone = (fsz - 1) * sizeof(UChar);
1178   const UChar *fdata = f.data();
1179   unsigned short fchar = fdata->uc;
1180   ++fdata;
1181   for (const UChar* c = data_ + pos; c <= end; c++)
1182     if (c->uc == fchar && !memcmp(c + 1, fdata, fsizeminusone))
1183       return (c - data_);
1184
1185   return -1;
1186 }
1187
1188 int UString::find(UChar ch, int pos) const
1189 {
1190   if (pos < 0)
1191     pos = 0;
1192   const UChar* data_ = data();
1193   const UChar *end = data_ + size();
1194   for (const UChar *c = data_ + pos; c < end; c++)
1195     if (*c == ch)
1196       return (c - data_);
1197
1198   return -1;
1199 }
1200
1201 int UString::rfind(const UString &f, int pos) const
1202 {
1203   int sz = size();
1204   int fsz = f.size();
1205   if (sz < fsz)
1206     return -1;
1207   if (pos < 0)
1208     pos = 0;
1209   if (pos > sz - fsz)
1210     pos = sz - fsz;
1211   if (fsz == 0)
1212     return pos;
1213   int fsizeminusone = (fsz - 1) * sizeof(UChar);
1214   const UChar *fdata = f.data();
1215   const UChar* data_ = data();
1216   for (const UChar* c = data_ + pos; c >= data_; c--) {
1217     if (*c == *fdata && !memcmp(c + 1, fdata + 1, fsizeminusone))
1218       return (c - data_);
1219   }
1220
1221   return -1;
1222 }
1223
1224 int UString::rfind(UChar ch, int pos) const
1225 {
1226   if (isEmpty())
1227     return -1;
1228   if (pos + 1 >= size())
1229     pos = size() - 1;
1230   const UChar* data_ = data();
1231   for (const UChar* c = data_ + pos; c >= data_; c--) {
1232     if (*c == ch)
1233       return (c - data_);
1234   }
1235
1236   return -1;
1237 }
1238
1239 UString UString::substr(int pos, int len) const
1240 {
1241   int s = size();
1242
1243   if (pos < 0)
1244     pos = 0;
1245   else if (pos >= s)
1246     pos = s;
1247   if (len < 0)
1248     len = s;
1249   if (pos + len >= s)
1250     len = s - pos;
1251
1252   if (pos == 0 && len == s)
1253     return *this;
1254
1255   return UString(Rep::create(m_rep, pos, len));
1256 }
1257
1258 void UString::copyForWriting()
1259 {
1260   int l = size();
1261   if (!l) return; // Not going to touch anything anyway.
1262   if (m_rep->rc > 1 || !m_rep->baseIsSelf()) {
1263     UChar* n = allocChars(l);
1264     memcpy(n, data(), l * sizeof(UChar));
1265     m_rep = Rep::create(n, l);
1266   }
1267 }
1268
1269 bool operator==(const UString& s1, const UString& s2)
1270 {
1271 #if 0
1272   if (s1.m_rep == s2.m_rep)
1273     return true;
1274 #endif
1275
1276   if (s1.m_rep->len != s2.m_rep->len)
1277     return false;
1278
1279   return (memcmp(s1.m_rep->data(), s2.m_rep->data(),
1280                  s1.m_rep->len * sizeof(UChar)) == 0);
1281 }
1282
1283 bool operator==(const UString& s1, const char *s2)
1284 {
1285   if (s2 == 0) {
1286     return s1.isEmpty();
1287   }
1288
1289   const UChar *u = s1.data();
1290   const UChar *uend = u + s1.size();
1291   while (u != uend && *s2) {
1292     if (u->uc != (unsigned char)*s2)
1293       return false;
1294     s2++;
1295     u++;
1296   }
1297
1298   return u == uend && *s2 == 0;
1299 }
1300
1301 bool operator<(const UString& s1, const UString& s2)
1302 {
1303   const int l1 = s1.size();
1304   const int l2 = s2.size();
1305   const int lmin = l1 < l2 ? l1 : l2;
1306   const UChar *c1 = s1.data();
1307   const UChar *c2 = s2.data();
1308   int l = 0;
1309   while (l < lmin && *c1 == *c2) {
1310     c1++;
1311     c2++;
1312     l++;
1313   }
1314   if (l < lmin)
1315     return (c1->uc < c2->uc);
1316
1317   return (l1 < l2);
1318 }
1319
1320 int compare(const UString& s1, const UString& s2)
1321 {
1322   const int l1 = s1.size();
1323   const int l2 = s2.size();
1324   const int lmin = l1 < l2 ? l1 : l2;
1325   const UChar *c1 = s1.data();
1326   const UChar *c2 = s2.data();
1327   int l = 0;
1328   while (l < lmin && *c1 == *c2) {
1329     c1++;
1330     c2++;
1331     l++;
1332   }
1333
1334   if (l < lmin)
1335     return (c1->uc > c2->uc) ? 1 : -1;
1336
1337   if (l1 == l2)
1338     return 0;
1339
1340   return (l1 > l2) ? 1 : -1;
1341 }
1342
// Given the first byte of a UTF-8 sequence, returns the sequence length
// (2, 3 or 4) implied by its leading bits, or 0 if the byte does not begin
// a multi-byte sequence of at most four bytes.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
  const unsigned lead = static_cast<unsigned char>(b0);

  if (lead < 0xC0) // 0xxxxxxx or 10xxxxxx: not a multi-byte lead byte
    return 0;
  if (lead < 0xE0) // 110xxxxx
    return 2;
  if (lead < 0xF0) // 1110xxxx
    return 3;
  if (lead < 0xF8) // 11110xxx
    return 4;
  return 0;        // 11111xxx: would start a sequence longer than 4 bytes
}
1355
1356 int UTF8SequenceLengthNonASCII(char b0)
1357 {
1358   return inlineUTF8SequenceLengthNonASCII(b0);
1359 }
1360
1361 inline int inlineUTF8SequenceLength(char b0)
1362 {
1363   return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
1364 }
1365
1366 // Given a first byte, gives the length of the UTF-8 sequence it begins.
1367 // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
1368 // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
1369 int UTF8SequenceLength(char b0)
1370 {
1371   return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
1372 }
1373
1374 // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
1375 // Only allows Unicode characters (U-00000000 to U-0010FFFF).
1376 // Returns -1 if the sequence is not valid (including presence of extra bytes).
1377 int decodeUTF8Sequence(const char *sequence)
1378 {
1379   // Handle 0-byte sequences (never valid).
1380   const unsigned char b0 = sequence[0];
1381   const int length = inlineUTF8SequenceLength(b0);
1382   if (length == 0)
1383     return -1;
1384
1385   // Handle 1-byte sequences (plain ASCII).
1386   const unsigned char b1 = sequence[1];
1387   if (length == 1) {
1388     if (b1)
1389       return -1;
1390     return b0;
1391   }
1392
1393   // Handle 2-byte sequences.
1394   if ((b1 & 0xC0) != 0x80)
1395     return -1;
1396   const unsigned char b2 = sequence[2];
1397   if (length == 2) {
1398     if (b2)
1399       return -1;
1400     const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
1401     if (c < 0x80)
1402       return -1;
1403     return c;
1404   }
1405
1406   // Handle 3-byte sequences.
1407   if ((b2 & 0xC0) != 0x80)
1408     return -1;
1409   const unsigned char b3 = sequence[3];
1410   if (length == 3) {
1411     if (b3)
1412       return -1;
1413     const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
1414     if (c < 0x800)
1415       return -1;
1416     // UTF-16 surrogates should never appear in UTF-8 data.
1417     if (c >= 0xD800 && c <= 0xDFFF)
1418       return -1;
1419     // Backwards BOM and U+FFFF should never appear in UTF-8 data.
1420     if (c == 0xFFFE || c == 0xFFFF)
1421       return -1;
1422     return c;
1423   }
1424
1425   // Handle 4-byte sequences.
1426   if ((b3 & 0xC0) != 0x80)
1427     return -1;
1428   const unsigned char b4 = sequence[4];
1429   if (length == 4) {
1430     if (b4)
1431       return -1;
1432     const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
1433     if (c < 0x10000 || c > 0x10FFFF)
1434       return -1;
1435     return c;
1436   }
1437
1438   return -1;
1439 }
1440
1441 CString UString::UTF8String() const
1442 {
1443   // Allocate a buffer big enough to hold all the characters.
1444   const int length = size();
1445   Vector<char, 1024> buffer(length * 3);
1446
1447   // Convert to runs of 8-bit characters.
1448   char *p = buffer.begin();
1449   const unsigned short* d = &data()->uc;
1450   for (int i = 0; i != length; ++i) {
1451     unsigned int c = d[i], sc;
1452     if (c < 0x80) {
1453       *p++ = (char)c;
1454     } else if (c < 0x800) {
1455       *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
1456       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
1457     } else if (c >= 0xD800 && c <= 0xDBFF && (i+1) < length &&
1458                    (sc = d[i+1]) >= 0xDC00 && sc <= 0xDFFF) {
1459       sc = 0x10000 + (((c & 0x3FF) << 10) | (sc & 0x3FF));
1460       *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
1461       *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
1462       *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1463       *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
1464       ++i;
1465     } else {
1466       *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
1467       *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1468       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
1469     }
1470   }
1471
1472   // Return the result as a C string.
1473   CString result(buffer.data(), p - buffer.data());
1474
1475   return result;
1476 }
1477
1478 } // namespace KJS