string.c

   1 /**********************************************************************
   2
   3   string.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 17:12:58 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
  10   Copyright (C) 2000  Information-technology Promotion Agency, Japan
  11
  12 **********************************************************************/
  13
  14 #include "ruby/ruby.h"
  15 #include "ruby/re.h"
  16 #include "ruby/encoding.h"
  17
     /* Shorthand accessors for the beg[] / end[] arrays reached through a
      * pointer named `regs`, which must be in scope wherever these are
      * expanded (presumably regexp match registers -- confirm at the
      * call sites). */
  18 #define BEG(no) regs->beg[no]
  19 #define END(no) regs->end[no]
  20
  21 #include <math.h>
  22 #include <ctype.h>
  23
  24 #ifdef HAVE_UNISTD_H
  25 #include <unistd.h>
  26 #endif
  27
  28 #undef rb_str_new2
  29 #undef rb_tainted_str_new2
  30 #undef rb_usascii_str_new2
  31 #undef rb_str_buf_new2
  32 #undef rb_str_buf_cat2
  33 #undef rb_str_cat2
  34
     /* Class objects exported from this file.  rb_cString is the class
      * passed to str_new()/str_alloc() for ordinary strings below;
      * rb_cSymbol is not referenced in this part of the file (presumably
      * the Symbol class -- confirm). */
  35 VALUE rb_cString;
  36 VALUE rb_cSymbol;
  37
     /* Per-string flag bits stored in RBASIC(str)->flags. */
     /* STR_TMPLOCK: while set, str_modifiable() refuses any modification. */
  38 #define STR_TMPLOCK FL_USER7
     /* STR_NOEMBED: the string uses the as.heap representation instead of
      * the embedded buffer (see STR_EMBED_P). */
  39 #define STR_NOEMBED FL_USER1
     /* STR_SHARED: as.heap.aux.shared names another string whose buffer
      * this string points into. */
  40 #define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
     /* STR_ASSOC: as.heap.aux.shared holds the object attached by
      * rb_str_associate(). */
  41 #define STR_ASSOC   FL_USER3
     /* Both predicates require STR_NOEMBED in addition to their own bit. */
  42 #define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
  43 #define STR_ASSOC_P(s)  FL_ALL(s, STR_NOEMBED|STR_ASSOC)
     /* "No capa" states: a heap string that is shared or associated uses
      * as.heap.aux for `shared`, so aux.capa is not meaningful there
      * (rb_str_capacity() reports the length instead). */
  44 #define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
  45 #define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
     /* Drop the shared/assoc bits, but only when STR_NOEMBED is set. */
  46 #define STR_UNSET_NOCAPA(s) do {\
  47     if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
  48 } while (0)
  49
  50
     /* Mark str as using the heap representation and zero the
      * embedded-length bits kept in its flag word. */
  51 #define STR_SET_NOEMBED(str) do {\
  52     FL_SET(str, STR_NOEMBED);\
  53     STR_SET_EMBED_LEN(str, 0);\
  54 } while (0)
     /* Mark str as using the embedded representation (flag only; the
      * caller is responsible for the contents and length). */
  55 #define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
     /* True when str keeps its bytes in the embedded buffer. */
  56 #define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
     /* Store n as the embedded length.  The length lives in the flag word,
      * in the bits selected by RSTRING_EMBED_LEN_MASK; n is evaluated
      * exactly once (copied into tmp_n first). */
  57 #define STR_SET_EMBED_LEN(str, n) do { \
  58     long tmp_n = (n);\
  59     RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
  60     RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
  61 } while (0)
  62
     /* Set the byte length of str, writing it wherever the current
      * representation keeps it: the flag bits for an embedded string,
      * as.heap.len otherwise.  Does not touch the buffer or terminator. */
  63 #define STR_SET_LEN(str, n) do { \
  64     if (STR_EMBED_P(str)) {\
  65         STR_SET_EMBED_LEN(str, n);\
  66     }\
  67     else {\
  68         RSTRING(str)->as.heap.len = (n);\
  69     }\
  70 } while (0)
  71
     /* Decrease the byte length of str by one, in whichever place the
      * current representation stores it.  No check is made that the
      * length is non-zero beforehand. */
  72 #define STR_DEC_LEN(str) do {\
  73     if (STR_EMBED_P(str)) {\
  74         long n = RSTRING_LEN(str);\
  75         n--;\
  76         STR_SET_EMBED_LEN(str, n);\
  77     }\
  78     else {\
  79         RSTRING(str)->as.heap.len--;\
  80     }\
  81 } while (0)
  82
  83 #define RESIZE_CAPA(str,capacity) do {\
  84     if (STR_EMBED_P(str)) {\
  85         if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
  86             char *tmp = ALLOC_N(char, capacity+1);\
  87             memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
  88             RSTRING(str)->as.heap.ptr = tmp;\
  89             RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
  90             STR_SET_NOEMBED(str);\
  91             RSTRING(str)->as.heap.aux.capa = (capacity);\
  92         }\
  93     }\
  94     else {\
  95         REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
  96         if (!STR_NOCAPA_P(str))\
  97             RSTRING(str)->as.heap.aux.capa = (capacity);\
  98     }\
  99 } while (0)
 100
     /* Code-range predicates.  Both go through rb_enc_str_coderange(),
      * which scans the string and caches the result when the range is
      * still unknown. */
 101 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
 102 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
 103
     /* Encoding object for the encoding index recorded on str. */
 104 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
 105
 106 static int
 107 single_byte_optimizable(VALUE str)
 108 {
 109     rb_encoding *enc = STR_ENC_GET(str);
 110
 111     if (rb_enc_mbmaxlen(enc) == 1)
 112         return 1;
 113
 114     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
 115     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
 116         return 1;
 117
 118     /* Conservative.  Possibly single byte.
 119      * "\xa1" in Shift_JIS for example. */
 120     return 0;
 121 }
 122
     /* Global VALUE exported from this file; it is not referenced in this
      * part of the source (presumably the default field separator used
      * when splitting strings -- confirm where it is initialized). */
 123 VALUE rb_fs;
 124
 125 static inline const char *
 126 search_nonascii(const char *p, const char *e)
 127 {
 128 #if SIZEOF_VALUE == 8
 129 # define NONASCII_MASK 0x8080808080808080LL
 130 #elif SIZEOF_VALUE == 4
 131 # define NONASCII_MASK 0x80808080UL
 132 #endif
 133 #ifdef NONASCII_MASK
 134     if (sizeof(VALUE) * 2 < e - p) {
 135         const VALUE *s, *t;
 136         const VALUE lowbits = sizeof(VALUE) - 1;
 137         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
 138         while (p < (const char *)s) {
 139             if (!ISASCII(*p))
 140                 return p;
 141             p++;
 142         }
 143         t = (const VALUE*)(~lowbits & (VALUE)e);
 144         while (s < t) {
 145             if (*s & NONASCII_MASK) {
 146                 t = s;
 147                 break;
 148             }
 149             s++;
 150         }
 151         p = (const char *)t;
 152     }
 153 #endif
 154     while (p < e) {
 155         if (!ISASCII(*p))
 156             return p;
 157         p++;
 158     }
 159     return NULL;
 160 }
 161
 162 static int
 163 coderange_scan(const char *p, long len, rb_encoding *enc)
 164 {
 165     const char *e = p + len;
 166
 167     if (rb_enc_to_index(enc) == 0) {
 168         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 169         p = search_nonascii(p, e);
 170         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 171     }
 172
 173     if (rb_enc_asciicompat(enc)) {
 174         p = search_nonascii(p, e);
 175         if (!p) {
 176             return ENC_CODERANGE_7BIT;
 177         }
 178         while (p < e) {
 179             int ret = rb_enc_precise_mbclen(p, e, enc);
 180             if (!MBCLEN_CHARFOUND_P(ret)) {
 181                 return ENC_CODERANGE_BROKEN;
 182             }
 183             p += MBCLEN_CHARFOUND_LEN(ret);
 184             if (p < e) {
 185                 p = search_nonascii(p, e);
 186                 if (!p) {
 187                     return ENC_CODERANGE_VALID;
 188                 }
 189             }
 190         }
 191         if (e < p) {
 192             return ENC_CODERANGE_BROKEN;
 193         }
 194         return ENC_CODERANGE_VALID;
 195     }
 196
 197     while (p < e) {
 198         int ret = rb_enc_precise_mbclen(p, e, enc);
 199
 200         if (!MBCLEN_CHARFOUND_P(ret)) {
 201             return ENC_CODERANGE_BROKEN;
 202         }
 203         p += MBCLEN_CHARFOUND_LEN(ret);
 204     }
 205     if (e < p) {
 206         return ENC_CODERANGE_BROKEN;
 207     }
 208     return ENC_CODERANGE_VALID;
 209 }
 210
 211 long
 212 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
 213 {
 214     const char *p = s;
 215
 216     if (*cr == ENC_CODERANGE_BROKEN)
 217         return e - s;
 218
 219     if (rb_enc_to_index(enc) == 0) {
 220         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 221         p = search_nonascii(p, e);
 222         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
 223         return e - s;
 224     }
 225     else if (rb_enc_asciicompat(enc)) {
 226         p = search_nonascii(p, e);
 227         if (!p) {
 228             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
 229             return e - s;
 230         }
 231         while (p < e) {
 232             int ret = rb_enc_precise_mbclen(p, e, enc);
 233             if (!MBCLEN_CHARFOUND_P(ret)) {
 234                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 235                 return p - s;
 236             }
 237             p += MBCLEN_CHARFOUND_LEN(ret);
 238             if (p < e) {
 239                 p = search_nonascii(p, e);
 240                 if (!p) {
 241                     *cr = ENC_CODERANGE_VALID;
 242                     return e - s;
 243                 }
 244             }
 245         }
 246         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
 247         return p - s;
 248     }
 249     else {
 250         while (p < e) {
 251             int ret = rb_enc_precise_mbclen(p, e, enc);
 252             if (!MBCLEN_CHARFOUND_P(ret)) {
 253                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 254                 return p - s;
 255             }
 256             p += MBCLEN_CHARFOUND_LEN(ret);
 257         }
 258         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
 259         return p - s;
 260     }
 261 }
 262
 263 static inline void
 264 str_enc_copy(VALUE str1, VALUE str2)
 265 {
 266     rb_enc_set_index(str1, ENCODING_GET(str2));
 267 }
 268
 269 static void
 270 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
 271 {
 272     /* this function is designed for copying encoding and coderange
 273      * from src to new string "dest" which is made from the part of src.
 274      */
 275     str_enc_copy(dest, src);
 276     switch (ENC_CODERANGE(src)) {
 277       case ENC_CODERANGE_7BIT:
 278         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 279         break;
 280       case ENC_CODERANGE_VALID:
 281         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
 282             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
 283             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 284         else
 285             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 286         break;
 287       default:
 288         if (RSTRING_LEN(dest) == 0) {
 289             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
 290                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 291             else
 292                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 293         }
 294         break;
 295     }
 296 }
 297
 298 static void
 299 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
 300 {
 301     str_enc_copy(dest, src);
 302     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
 303 }
 304
 305 int
 306 rb_enc_str_coderange(VALUE str)
 307 {
 308     int cr = ENC_CODERANGE(str);
 309
 310     if (cr == ENC_CODERANGE_UNKNOWN) {
 311         rb_encoding *enc = STR_ENC_GET(str);
 312         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
 313         ENC_CODERANGE_SET(str, cr);
 314     }
 315     return cr;
 316 }
 317
 318 int
 319 rb_enc_str_asciionly_p(VALUE str)
 320 {
 321     rb_encoding *enc = STR_ENC_GET(str);
 322
 323     if (!rb_enc_asciicompat(enc))
 324         return Qfalse;
 325     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
 326         return Qtrue;
 327     return Qfalse;
 328 }
 329
 330 static inline void
 331 str_mod_check(VALUE s, const char *p, long len)
 332 {
 333     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
 334         rb_raise(rb_eRuntimeError, "string modified");
 335     }
 336 }
 337
 338 static inline void
 339 str_frozen_check(VALUE s)
 340 {
 341     if (OBJ_FROZEN(s)) {
 342         rb_raise(rb_eRuntimeError, "string frozen");
 343     }
 344 }
 345
 346 size_t
 347 rb_str_capacity(VALUE str)
 348 {
 349     if (STR_EMBED_P(str)) {
 350         return RSTRING_EMBED_LEN_MAX;
 351     }
 352     else if (STR_NOCAPA_P(str)) {
 353         return RSTRING(str)->as.heap.len;
 354     }
 355     else {
 356         return RSTRING(str)->as.heap.aux.capa;
 357     }
 358 }
 359
 360 static inline VALUE
 361 str_alloc(VALUE klass)
 362 {
 363     NEWOBJ(str, struct RString);
 364     OBJSETUP(str, klass, T_STRING);
 365
 366     str->as.heap.ptr = 0;
 367     str->as.heap.len = 0;
 368     str->as.heap.aux.capa = 0;
 369
 370     return (VALUE)str;
 371 }
 372
 373 static VALUE
 374 str_new(VALUE klass, const char *ptr, long len)
 375 {
 376     VALUE str;
 377
 378     if (len < 0) {
 379         rb_raise(rb_eArgError, "negative string size (or size too big)");
 380     }
 381
 382     str = str_alloc(klass);
 383     if (len > RSTRING_EMBED_LEN_MAX) {
 384         RSTRING(str)->as.heap.aux.capa = len;
 385         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
 386         STR_SET_NOEMBED(str);
 387     }
 388     if (ptr) {
 389         memcpy(RSTRING_PTR(str), ptr, len);
 390     }
 391     STR_SET_LEN(str, len);
 392     RSTRING_PTR(str)[len] = '\0';
 393     return str;
 394 }
 395
 396 VALUE
 397 rb_str_new(const char *ptr, long len)
 398 {
 399     return str_new(rb_cString, ptr, len);
 400 }
 401
 402 VALUE
 403 rb_usascii_str_new(const char *ptr, long len)
 404 {
 405     VALUE str = rb_str_new(ptr, len);
 406     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 407     return str;
 408 }
 409
 410 VALUE
 411 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
 412 {
 413     VALUE str = rb_str_new(ptr, len);
 414     rb_enc_associate(str, enc);
 415     return str;
 416 }
 417
 418 VALUE
 419 rb_str_new2(const char *ptr)
 420 {
 421     if (!ptr) {
 422         rb_raise(rb_eArgError, "NULL pointer given");
 423     }
 424     return rb_str_new(ptr, strlen(ptr));
 425 }
 426
 427 VALUE
 428 rb_usascii_str_new2(const char *ptr)
 429 {
 430     VALUE str = rb_str_new2(ptr);
 431     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 432     return str;
 433 }
 434
 435 VALUE
 436 rb_tainted_str_new(const char *ptr, long len)
 437 {
 438     VALUE str = rb_str_new(ptr, len);
 439
 440     OBJ_TAINT(str);
 441     return str;
 442 }
 443
 444 VALUE
 445 rb_tainted_str_new2(const char *ptr)
 446 {
 447     VALUE str = rb_str_new2(ptr);
 448
 449     OBJ_TAINT(str);
 450     return str;
 451 }
 452
 453 static VALUE
 454 str_replace_shared(VALUE str2, VALUE str)
 455 {
 456     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
 457         STR_SET_EMBED(str2);
 458         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
 459         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
 460     }
 461     else {
 462         FL_SET(str2, STR_NOEMBED);
 463         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
 464         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
 465         RSTRING(str2)->as.heap.aux.shared = str;
 466         FL_SET(str2, ELTS_SHARED);
 467     }
 468     rb_enc_cr_str_exact_copy(str2, str);
 469
 470     return str2;
 471 }
 472
 473 static VALUE
 474 str_new_shared(VALUE klass, VALUE str)
 475 {
 476     return str_replace_shared(str_alloc(klass), str);
 477 }
 478
 479 static VALUE
 480 str_new3(VALUE klass, VALUE str)
 481 {
 482     return str_new_shared(klass, str);
 483 }
 484
 485 VALUE
 486 rb_str_new3(VALUE str)
 487 {
 488     VALUE str2 = str_new3(rb_obj_class(str), str);
 489
 490     OBJ_INFECT(str2, str);
 491     return str2;
 492 }
 493
 494 static VALUE
 495 str_new4(VALUE klass, VALUE str)
 496 {
 497     VALUE str2;
 498
 499     str2 = str_alloc(klass);
 500     STR_SET_NOEMBED(str2);
 501     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
 502     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
 503     if (STR_SHARED_P(str)) {
 504         FL_SET(str2, ELTS_SHARED);
 505         RSTRING(str2)->as.heap.aux.shared = RSTRING(str)->as.heap.aux.shared;
 506     }
 507     else {
 508         FL_SET(str, ELTS_SHARED);
 509         RSTRING(str)->as.heap.aux.shared = str2;
 510     }
 511     rb_enc_cr_str_exact_copy(str2, str);
 512     OBJ_INFECT(str2, str);
 513     return str2;
 514 }
 515
     /*
      * Return a frozen string with the same contents as orig.
      *
      * orig itself is returned when it is already frozen.  Otherwise the
      * result depends on how orig stores its bytes (see the branches
      * below); in every case the returned object is frozen before it is
      * handed back and carries orig's encoding, code range and taint.
      */
 516 VALUE
 517 rb_str_new4(VALUE orig)
 518 {
 519     VALUE klass, str;
 520
 521     if (OBJ_FROZEN(orig)) return orig;
 522     klass = rb_obj_class(orig);
     /* orig borrows its buffer from another string `str`. */
 523     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
 524         long ofs;
     /* Length difference between the shared string and orig.  It is
      * applied below as a forward offset, i.e. orig is assumed to be
      * the tail of the shared buffer -- NOTE(review): confirm. */
 525         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
     /* The shared string can be returned as-is only if it has the
      * same length and class and would not need to gain taint;
      * otherwise wrap it in a new string and narrow that one. */
 526         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
 527             (!OBJ_TAINTED(str) && OBJ_TAINTED(orig))) {
 528             str = str_new3(klass, str);
 529             RSTRING(str)->as.heap.ptr += ofs;
 530             RSTRING(str)->as.heap.len -= ofs;
 531         }
 532         rb_enc_cr_str_exact_copy(str, orig);
 533         OBJ_INFECT(str, orig);
 534     }
     /* Embedded strings are simply copied. */
 535     else if (STR_EMBED_P(orig)) {
 536         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
 537         rb_enc_cr_str_exact_copy(str, orig);
 538         OBJ_INFECT(str, orig);
 539     }
     /* orig carries an associated object in aux.shared: remember it,
      * clear the flag so str_new4() treats orig as a plain heap string
      * (which overwrites orig's aux.shared), then move the association
      * onto the new string. */
 540     else if (STR_ASSOC_P(orig)) {
 541         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
 542         FL_UNSET(orig, STR_ASSOC);
 543         str = str_new4(klass, orig);
 544         FL_SET(str, STR_ASSOC);
 545         RSTRING(str)->as.heap.aux.shared = assoc;
 546     }
     /* Plain heap string: let the new string and orig share the buffer. */
 547     else {
 548         str = str_new4(klass, orig);
 549     }
 550     OBJ_FREEZE(str);
 551     return str;
 552 }
 553
 554 VALUE
 555 rb_str_new5(VALUE obj, const char *ptr, long len)
 556 {
 557     return str_new(rb_obj_class(obj), ptr, len);
 558 }
 559
 560 #define STR_BUF_MIN_SIZE 128
 561
 562 VALUE
 563 rb_str_buf_new(long capa)
 564 {
 565     VALUE str = str_alloc(rb_cString);
 566
 567     if (capa < STR_BUF_MIN_SIZE) {
 568         capa = STR_BUF_MIN_SIZE;
 569     }
 570     FL_SET(str, STR_NOEMBED);
 571     RSTRING(str)->as.heap.aux.capa = capa;
 572     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
 573     RSTRING(str)->as.heap.ptr[0] = '\0';
 574
 575     return str;
 576 }
 577
 578 VALUE
 579 rb_str_buf_new2(const char *ptr)
 580 {
 581     VALUE str;
 582     long len = strlen(ptr);
 583
 584     str = rb_str_buf_new(len);
 585     rb_str_buf_cat(str, ptr, len);
 586
 587     return str;
 588 }
 589
 590 VALUE
 591 rb_str_tmp_new(long len)
 592 {
 593     return str_new(0, 0, len);
 594 }
 595
 596 void
 597 rb_str_free(VALUE str)
 598 {
 599     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
 600         xfree(RSTRING(str)->as.heap.ptr);
 601     }
 602 }
 603
 604 VALUE
 605 rb_str_to_str(VALUE str)
 606 {
 607     return rb_convert_type(str, T_STRING, "String", "to_str");
 608 }
 609
     /*
      * Replace the contents of str with those of str2.
      *
      * Short contents (<= RSTRING_EMBED_LEN_MAX) are copied into str's
      * embedded buffer and str2 is left untouched.  Longer contents are
      * moved: str takes over str2's buffer pointer, length and aux field,
      * and str2 is reset to an empty heap view ("abandoned").  str2's
      * encoding, code range and taint are transferred in both cases.
      * Does nothing when str and str2 are the same object.
      */
 610 void
 611 rb_str_shared_replace(VALUE str, VALUE str2)
 612 {
 613     rb_encoding *enc;
 614     int cr;
 615     if (str == str2) return;
     /* Capture str2's encoding state before anything is modified. */
 616     enc = STR_ENC_GET(str2);
 617     cr = ENC_CODERANGE(str2);
     /* Runs the modifiability checks and, if str was shared, gives it a
      * private buffer -- so the free below never hits a borrowed one. */
 618     rb_str_modify(str);
 619     OBJ_INFECT(str, str2);
 620     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
 621         xfree(RSTRING_PTR(str));
 622     }
     /* Small enough to embed: copy bytes plus terminator and finish. */
 623     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
 624         STR_SET_EMBED(str);
 625         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
 626         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
 627         rb_enc_associate(str, enc);
 628         ENC_CODERANGE_SET(str, cr);
 629         return;
 630     }
     /* Take over str2's heap buffer. */
 631     STR_SET_NOEMBED(str);
 632     STR_UNSET_NOCAPA(str);
 633     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
 634     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
     /* aux is either `shared` (with the matching flag bits) or `capa`,
      * depending on str2's state; copy whichever applies. */
 635     if (STR_NOCAPA_P(str2)) {
 636         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
 637         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
 638     }
 639     else {
 640         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
 641     }
 642     RSTRING(str2)->as.heap.ptr = 0;     /* abandon str2 */
 643     RSTRING(str2)->as.heap.len = 0;
 644     RSTRING(str2)->as.heap.aux.capa = 0;
 645     STR_UNSET_NOCAPA(str2);
 646     rb_enc_associate(str, enc);
 647     ENC_CODERANGE_SET(str, cr);
 648 }
 649
 650 static ID id_to_s;
 651
 652 VALUE
 653 rb_obj_as_string(VALUE obj)
 654 {
 655     VALUE str;
 656
 657     if (TYPE(obj) == T_STRING) {
 658         return obj;
 659     }
 660     str = rb_funcall(obj, id_to_s, 0);
 661     if (TYPE(str) != T_STRING)
 662         return rb_any_to_s(obj);
 663     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
 664     return str;
 665 }
 666
 667 static VALUE rb_str_replace(VALUE, VALUE);
 668
 669 VALUE
 670 rb_str_dup(VALUE str)
 671 {
 672     VALUE dup = str_alloc(rb_obj_class(str));
 673     rb_str_replace(dup, str);
 674     return dup;
 675 }
 676
 677
 678 /*
 679  *  call-seq:
 680  *     String.new(str="")   => new_str
 681  *
 682  *  Returns a new string object containing a copy of <i>str</i>.
 683  */
 684
 685 static VALUE
 686 rb_str_init(int argc, VALUE *argv, VALUE str)
 687 {
 688     VALUE orig;
 689
 690     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
 691         rb_str_replace(str, orig);
 692     return str;
 693 }
 694
 695 long
 696 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
 697 {
 698     long c;
 699     const char *q;
 700
 701     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 702         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
 703     }
 704     else if (rb_enc_asciicompat(enc)) {
 705         c = 0;
 706         while (p < e) {
 707             if (ISASCII(*p)) {
 708                 q = search_nonascii(p, e);
 709                 if (!q)
 710                     return c + (e - p);
 711                 c += q - p;
 712                 p = q;
 713             }
 714             p += rb_enc_mbclen(p, e, enc);
 715             c++;
 716         }
 717         return c;
 718     }
 719
 720     for (c=0; p<e; c++) {
 721         p += rb_enc_mbclen(p, e, enc);
 722     }
 723     return c;
 724 }
 725
 726 long
 727 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
 728 {
 729     long c;
 730     const char *q;
 731     int ret;
 732
 733     *cr = 0;
 734     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 735         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
 736     }
 737     else if (rb_enc_asciicompat(enc)) {
 738         c = 0;
 739         while (p < e) {
 740             if (ISASCII(*p)) {
 741                 q = search_nonascii(p, e);
 742                 if (!q) {
 743                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
 744                     return c + (e - p);
 745                 }
 746                 c += q - p;
 747                 p = q;
 748             }
 749             ret = rb_enc_precise_mbclen(p, e, enc);
 750             if (MBCLEN_CHARFOUND_P(ret)) {
 751                 *cr |= ENC_CODERANGE_VALID;
 752                 p += MBCLEN_CHARFOUND_LEN(ret);
 753             }
 754             else {
 755                 *cr = ENC_CODERANGE_BROKEN;
 756                 p++;
 757             }
 758             c++;
 759         }
 760         if (!*cr) *cr = ENC_CODERANGE_7BIT;
 761         return c;
 762     }
 763
 764     for (c=0; p<e; c++) {
 765         ret = rb_enc_precise_mbclen(p, e, enc);
 766         if (MBCLEN_CHARFOUND_P(ret)) {
 767             *cr |= ENC_CODERANGE_VALID;
 768             p += MBCLEN_CHARFOUND_LEN(ret);
 769         }
 770         else {
 771             *cr = ENC_CODERANGE_BROKEN;
 772             p++;
 773         }
 774     }
 775     if (!*cr) *cr = ENC_CODERANGE_7BIT;
 776     return c;
 777 }
 778
     /* Only available when search_nonascii() defined NONASCII_MASK, i.e.
      * when VALUE is 4 or 8 bytes wide. */
 779 #ifdef NONASCII_MASK
     /* A byte starts a character unless its top two bits are 10
      * (the UTF-8 continuation-byte pattern). */
 780 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
     /*
      * Count how many of the sizeof(VALUE) bytes in the word *s satisfy
      * is_utf8_lead_byte(), without looking at the bytes one by one.
      * s must be suitably aligned for a VALUE load.
      */
 781 static inline VALUE
 782 count_utf8_lead_bytes_with_word(const VALUE *s)
 783 {
 784     VALUE d = *s;
     /* After this, bit 6 of each byte is (bit6 | ~bit7): set exactly when
      * the byte is NOT of the form 10xxxxxx. */
 785     d |= ~(d>>1);
     /* Move that indicator to bit 0 of each byte and drop everything
      * else (NONASCII_MASK >> 7 is 0x01 repeated in every byte). */
 786     d >>= 6;
 787     d &= NONASCII_MASK >> 7;
     /* Fold the per-byte 0/1 values into the lowest byte.  The total is
      * at most sizeof(VALUE) (<= 8), so no byte overflows and the sum
      * fits in the low four bits. */
 788     d += (d>>8);
 789     d += (d>>16);
 790 #if SIZEOF_VALUE == 8
 791     d += (d>>32);
 792 #endif
 793     return (d&0xF);
 794 }
 795 #endif
 796
 797 static long
 798 str_strlen(VALUE str, rb_encoding *enc)
 799 {
 800     const char *p, *e;
 801     int n, cr;
 802
 803     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
 804     if (!enc) enc = STR_ENC_GET(str);
 805     p = RSTRING_PTR(str);
 806     e = RSTRING_END(str);
 807 #ifdef NONASCII_MASK
 808     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
 809         enc == rb_utf8_encoding()) {
 810         VALUE len = 0;
 811         if (sizeof(VALUE) * 2 < e - p) {
 812             const VALUE *s, *t;
 813             const VALUE lowbits = sizeof(VALUE) - 1;
 814             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
 815             t = (const VALUE*)(~lowbits & (VALUE)e);
 816             while (p < (const char *)s) {
 817                 if (is_utf8_lead_byte(*p)) len++;
 818                 p++;
 819             }
 820             while (s < t) {
 821                 len += count_utf8_lead_bytes_with_word(s);
 822                 s++;
 823             }
 824             p = (const char *)s;
 825         }
 826         while (p < e) {
 827             if (is_utf8_lead_byte(*p)) len++;
 828             p++;
 829         }
 830         return (long)len;
 831     }
 832 #endif
 833     n = rb_enc_strlen_cr(p, e, enc, &cr);
 834     if (cr) {
 835         ENC_CODERANGE_SET(str, cr);
 836     }
 837     return n;
 838 }
 839
 840 /*
 841  *  call-seq:
 842  *     str.length   => integer
 843  *     str.size     => integer
 844  *
 845  *  Returns the character length of <i>str</i>.
 846  */
 847
 848 VALUE
 849 rb_str_length(VALUE str)
 850 {
 851     int len;
 852
 853     len = str_strlen(str, STR_ENC_GET(str));
 854     return INT2NUM(len);
 855 }
 856
 857 /*
 858  *  call-seq:
 859  *     str.bytesize  => integer
 860  *
 861  *  Returns the length of <i>str</i> in bytes.
 862  */
 863
 864 static VALUE
 865 rb_str_bytesize(VALUE str)
 866 {
 867     return INT2NUM(RSTRING_LEN(str));
 868 }
 869
 870 /*
 871  *  call-seq:
 872  *     str.empty?   => true or false
 873  *
 874  *  Returns <code>true</code> if <i>str</i> has a length of zero.
 875  *
 876  *     "hello".empty?   #=> false
 877  *     "".empty?        #=> true
 878  */
 879
 880 static VALUE
 881 rb_str_empty(VALUE str)
 882 {
 883     if (RSTRING_LEN(str) == 0)
 884         return Qtrue;
 885     return Qfalse;
 886 }
 887
 888 /*
 889  *  call-seq:
 890  *     str + other_str   => new_str
 891  *
 892  *  Concatenation---Returns a new <code>String</code> containing
 893  *  <i>other_str</i> concatenated to <i>str</i>.
 894  *
 895  *     "Hello from " + self.to_s   #=> "Hello from main"
 896  */
 897
 898 VALUE
 899 rb_str_plus(VALUE str1, VALUE str2)
 900 {
 901     VALUE str3;
 902     rb_encoding *enc;
 903
 904     StringValue(str2);
 905     enc = rb_enc_check(str1, str2);
 906     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
 907     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
 908     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
 909            RSTRING_PTR(str2), RSTRING_LEN(str2));
 910     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
 911
 912     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
 913         OBJ_TAINT(str3);
 914     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
 915                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
 916     return str3;
 917 }
 918
 919 /*
 920  *  call-seq:
 921  *     str * integer   => new_str
 922  *
 923  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
 924  *  the receiver.
 925  *
 926  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
 927  */
 928
 929 VALUE
 930 rb_str_times(VALUE str, VALUE times)
 931 {
 932     VALUE str2;
 933     long n, len;
 934
 935     len = NUM2LONG(times);
 936     if (len < 0) {
 937         rb_raise(rb_eArgError, "negative argument");
 938     }
 939     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
 940         rb_raise(rb_eArgError, "argument too big");
 941     }
 942
 943     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
 944     if (len) {
 945         n = RSTRING_LEN(str);
 946         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), n);
 947         while (n <= len/2) {
 948             memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), n);
 949             n *= 2;
 950         }
 951         memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), len-n);
 952     }
 953     RSTRING_PTR(str2)[RSTRING_LEN(str2)] = '\0';
 954     OBJ_INFECT(str2, str);
 955     rb_enc_cr_str_copy_for_substr(str2, str);
 956
 957     return str2;
 958 }
 959
 960 /*
 961  *  call-seq:
 962  *     str % arg   => new_str
 963  *
 964  *  Format---Uses <i>str</i> as a format specification, and returns the result
 965  *  of applying it to <i>arg</i>. If the format specification contains more than
 966  *  one substitution, then <i>arg</i> must be an <code>Array</code> containing
 967  *  the values to be substituted. See <code>Kernel::sprintf</code> for details
 968  *  of the format string.
 969  *
 970  *     "%05d" % 123                              #=> "00123"
 971  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
 972  */
 973
 974 static VALUE
 975 rb_str_format_m(VALUE str, VALUE arg)
 976 {
 977     volatile VALUE tmp = rb_check_array_type(arg);
 978
 979     if (!NIL_P(tmp)) {
 980         return rb_str_format(RARRAY_LEN(tmp), RARRAY_PTR(tmp), str);
 981     }
 982     return rb_str_format(1, &arg, str);
 983 }
 984
 985 static inline void
 986 str_modifiable(VALUE str)
 987 {
 988     if (FL_TEST(str, STR_TMPLOCK)) {
 989         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
 990     }
 991     if (OBJ_FROZEN(str)) rb_error_frozen("string");
 992     if (!OBJ_TAINTED(str) && rb_safe_level() >= 4)
 993         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
 994 }
 995
 996 static inline int
 997 str_independent(VALUE str)
 998 {
 999     str_modifiable(str);
1000     if (!STR_SHARED_P(str)) return 1;
1001     if (STR_EMBED_P(str)) return 1;
1002     return 0;
1003 }
1004
1005 static void
1006 str_make_independent(VALUE str)
1007 {
1008     char *ptr;
1009     long len = RSTRING_LEN(str);
1010
1011     ptr = ALLOC_N(char, len+1);
1012     if (RSTRING_PTR(str)) {
1013         memcpy(ptr, RSTRING_PTR(str), len);
1014     }
1015     STR_SET_NOEMBED(str);
1016     ptr[len] = 0;
1017     RSTRING(str)->as.heap.ptr = ptr;
1018     RSTRING(str)->as.heap.len = len;
1019     RSTRING(str)->as.heap.aux.capa = len;
1020     STR_UNSET_NOCAPA(str);
1021 }
1022
1023 void
1024 rb_str_modify(VALUE str)
1025 {
1026     if (!str_independent(str))
1027         str_make_independent(str);
1028     ENC_CODERANGE_CLEAR(str);
1029 }
1030
     /*
      * Attach the object `add` to str, storing it in as.heap.aux.shared
      * and marking str with STR_ASSOC.  If str already has an associated
      * object, `add` is concatenated onto it with rb_ary_concat()
      * instead (so both are expected to be arrays).
      *
      * Because aux then no longer holds a capacity, str is first forced
      * into a heap buffer whose size matches its length.
      * Raises via rb_error_frozen() when str is frozen.
      */
1031 void
1032 rb_str_associate(VALUE str, VALUE add)
1033 {
1034     /* sanity check */
1035     if (OBJ_FROZEN(str)) rb_error_frozen("string");
1036     if (STR_ASSOC_P(str)) {
1037         /* already associated */
1038         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
1039     }
1040     else {
     /* Shared string: detach it, and if the string it was sharing
      * with carries an association, extend and adopt that one. */
1041         if (STR_SHARED_P(str)) {
1042             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
1043             str_make_independent(str);
1044             if (STR_ASSOC_P(assoc)) {
1045                 assoc = RSTRING(assoc)->as.heap.aux.shared;
1046                 rb_ary_concat(assoc, add);
1047                 add = assoc;
1048             }
1049         }
     /* Embedded string: move it to the heap so aux is usable. */
1050         else if (STR_EMBED_P(str)) {
1051             str_make_independent(str);
1052         }
     /* Heap string with slack: shrink the buffer to the exact length,
      * since the capacity can no longer be recorded afterwards. */
1053         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
1054             RESIZE_CAPA(str, RSTRING_LEN(str));
1055         }
1056         FL_SET(str, STR_ASSOC);
     /* Clear the class pointer of the associated object
      * (presumably to hide it from Ruby code -- confirm). */
1057         RBASIC(add)->klass = 0;
1058         RSTRING(str)->as.heap.aux.shared = add;
1059     }
1060 }
1061
1062 VALUE
1063 rb_str_associated(VALUE str)
1064 {
1065     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1066     if (STR_ASSOC_P(str)) {
1067         return RSTRING(str)->as.heap.aux.shared;
1068     }
1069     return Qfalse;
1070 }
1071
1072 VALUE
1073 rb_string_value(volatile VALUE *ptr)
1074 {
1075     VALUE s = *ptr;
1076     if (TYPE(s) != T_STRING) {
1077         s = rb_str_to_str(s);
1078         *ptr = s;
1079     }
1080     return s;
1081 }
1082
1083 char *
1084 rb_string_value_ptr(volatile VALUE *ptr)
1085 {
1086     return RSTRING_PTR(rb_string_value(ptr));
1087 }
1088
1089 char *
1090 rb_string_value_cstr(volatile VALUE *ptr)
1091 {
1092     VALUE str = rb_string_value(ptr);
1093     char *s = RSTRING_PTR(str);
1094
1095     if (!s || RSTRING_LEN(str) != strlen(s)) {
1096         rb_raise(rb_eArgError, "string contains null byte");
1097     }
1098     return s;
1099 }
1100
1101 VALUE
1102 rb_check_string_type(VALUE str)
1103 {
1104     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1105     return str;
1106 }
1107
1108 /*
1109  *  call-seq:
1110  *     String.try_convert(obj) -> string or nil
1111  *
1112  *  Try to convert <i>obj</i> into a String, using to_str method.
1113  *  Returns converted string or nil if <i>obj</i> cannot be converted
1114  *  for any reason.
1115  *
1116  *     String.try_convert("str")     # => str
1117  *     String.try_convert(/re/)      # => nil
1118  */
1119 static VALUE
1120 rb_str_s_try_convert(VALUE dummy, VALUE str)
1121 {
1122     return rb_check_string_type(str);
1123 }
1124
1125 char*
1126 rb_enc_nth(const char *p, const char *e, int nth, rb_encoding *enc)
1127 {
         /*
          * Moves p forward by nth characters of encoding enc, treating e as
          * the end of the buffer, and returns the resulting position.  The
          * strategy depends on how wide enc's characters can be.
          */
1128     if (rb_enc_mbmaxlen(enc) == 1) {
             /* single-byte encoding: one byte per character */
1129         p += nth;
1130     }
1131     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
             /* fixed-width encoding: every character is mbmaxlen bytes */
1132         p += nth * rb_enc_mbmaxlen(enc);
1133     }
1134     else if (rb_enc_asciicompat(enc)) {
             /* variable-width, ASCII-compatible: skip ASCII runs in bulk */
1135         const char *p2, *e2;
1136         int n;
1137
1138         while (p < e && 0 < nth) {
                 /* e2 is where we would land if the next nth bytes were all ASCII */
1139             e2 = p + nth;
1140             if (e < e2)
                     /* fewer than nth bytes remain, so nth characters cannot fit */
1141                 return (char *)e;
1142             if (ISASCII(*p)) {
                     /* presumably yields the first non-ASCII byte in [p, e2),
                      * or NULL when there is none -- confirm against
                      * search_nonascii */
1143                 p2 = search_nonascii(p, e2);
1144                 if (!p2)
1145                     return (char *)e2;
                     /* the p2 - p ASCII bytes just skipped count as characters */
1146                 nth -= p2 - p;
1147                 p = p2;
1148             }
                 /* step over one character using its byte length */
1149             n = rb_enc_mbclen(p, e, enc);
1150             p += n;
1151             nth--;
1152         }
             /* the loop stopped at e before nth characters were consumed */
1153         if (nth != 0)
1154             return (char *)e;
1155         return (char *)p;
1156     }
1157     else {
             /* generic variable-width encoding: one character at a time */
1158         while (p<e && nth--) {
1159             p += rb_enc_mbclen(p, e, enc);
1160         }
1161     }
         /* the branches that fall through here may overshoot; clamp to e */
1162     if (p > e) p = e;
1163     return (char*)p;
1164 }
1165
1166 static char*
1167 str_nth(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1168 {
1169     if (singlebyte)
1170         p += nth;
1171     else {
1172         p = rb_enc_nth(p, e, nth, enc);
1173     }
1174     if (!p) return 0;
1175     if (p > e) p = e;
1176     return (char *)p;
1177 }
1178
1179 /* char offset to byte offset */
1180 static int
1181 str_offset(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1182 {
1183     const char *pp = str_nth(p, e, nth, enc, singlebyte);
1184     if (!pp) return e - p;
1185     return pp - p;
1186 }
1187
1188 #ifdef NONASCII_MASK
1189 static char *
1190 str_utf8_nth(const char *p, const char *e, int nth)
1191 {
1192     if (sizeof(VALUE) * 2 < nth) {
1193         const VALUE *s, *t;
1194         const VALUE lowbits = sizeof(VALUE) - 1;
1195         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1196         t = (const VALUE*)(~lowbits & (VALUE)e);
1197         while (p < (const char *)s) {
1198             if (is_utf8_lead_byte(*p)) nth--;
1199             p++;
1200         }
1201         do {
1202             nth -= count_utf8_lead_bytes_with_word(s);
1203             s++;
1204         } while (s < t && sizeof(VALUE) <= nth);
1205         p = (char *)s;
1206     }
1207     while (p < e) {
1208         if (is_utf8_lead_byte(*p)) {
1209             if (nth == 0) break;
1210             nth--;
1211         }
1212         p++;
1213     }
1214     return (char *)p;
1215 }
1216
/*
 * Byte distance from p to the nth UTF-8 character as located by
 * str_utf8_nth(); falls back to the distance to e on a null result.
 */
static int
str_utf8_offset(const char *p, const char *e, int nth)
{
    const char *stop = str_utf8_nth(p, e, nth);

    return stop ? stop - p : e - p;
}
1224 #endif
1225
1226 /* byte offset to char offset */
1227 long
1228 rb_str_sublen(VALUE str, long pos)
1229 {
1230     if (single_byte_optimizable(str) || pos < 0)
1231         return pos;
1232     else {
1233         char *p = RSTRING_PTR(str);
1234         return rb_enc_strlen(p, p + pos, STR_ENC_GET(str));
1235     }
1236 }
1237
1238 VALUE
1239 rb_str_subseq(VALUE str, long beg, long len)
1240 {
1241     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1242
1243     rb_enc_cr_str_copy_for_substr(str2, str);
1244     OBJ_INFECT(str2, str);
1245
1246     return str2;
1247 }
1248
1249 VALUE
1250 rb_str_substr(VALUE str, long beg, long len)
1251 {
         /*
          * Returns the substring of str that starts at character index beg
          * and spans up to len characters.  A negative beg counts from the
          * end of the string.  Returns Qnil when len is negative or beg lies
          * outside the string.
          */
1252     rb_encoding *enc = STR_ENC_GET(str);
1253     VALUE str2;
1254     char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
1255     int singlebyte;
1256
1257     if (len < 0) return Qnil;
1258     if (!RSTRING_LEN(str)) {
1259         len = 0;
1260     }
1261     if (beg < 0) {
             /* no more than -beg characters exist after a start -beg from the end */
1262         if (len > -beg) len = -beg;
             /* start is close to the end relative to the string size: walk
              * backwards from e instead of counting the whole string */
1263         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
1264             beg = -beg;
                 /* move e back over the characters that follow the substring */
1265             while (beg-- > len && (e = rb_enc_prev_char(s, e, enc)) != 0);
1266             p = e;
1267             if (!p) return Qnil;
                 /* move p back over the len characters of the substring */
1268             while (len-- > 0 && (p = rb_enc_prev_char(s, p, enc)) != 0);
1269             if (!p) return Qnil;
                 /* from here on len is a byte count */
1270             len = e - p;
1271             goto sub;
1272         }
1273         else {
1274             beg += str_strlen(str, enc);
1275             if (beg < 0) return Qnil;
1276         }
1277     }
1278     else if (beg > 0 && beg > str_strlen(str, enc)) {
1279         return Qnil;
1280     }
1281     singlebyte = single_byte_optimizable(str);
         /* each branch below turns (beg, len) in characters into p and a byte len */
1282     if (len == 0) {
1283         p = 0;
1284     }
1285 #ifdef NONASCII_MASK
1286     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1287         enc == rb_utf8_encoding()) {
             /* valid UTF-8: use the lead-byte counting helpers */
1288         p = str_utf8_nth(s, e, beg);
1289         len = str_utf8_offset(p, e, len);
1290     }
1291 #endif
1292     else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) {
1293         len = 0;
1294     }
1295     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
             /* fixed-width encoding: scale by the character width, capped at e */
1296         if (len * rb_enc_mbmaxlen(enc) > e - p)
1297             len = e - p;
1298         else
1299             len *= rb_enc_mbmaxlen(enc);
1300     }
1301     else {
1302         len = str_offset(p, e, len, enc, singlebyte);
1303     }
1304   sub:
         /*
          * A result too long to embed whose end coincides with the end of
          * str is built from rb_str_new4/str_new3 and then narrowed to the
          * last len bytes -- presumably so the buffer is shared rather than
          * copied; confirm against str_new3.
          * NOTE(review): beg is a character index here while len is a byte
          * count, and beg has been modified on the goto path above, so this
          * test mixes units -- confirm it is intended.
          */
1305     if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
1306         str2 = rb_str_new4(str);
1307         str2 = str_new3(rb_obj_class(str2), str2);
1308         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1309         RSTRING(str2)->as.heap.len = len;
1310     }
1311     else {
1312         str2 = rb_str_new5(str, p, len);
1313         rb_enc_cr_str_copy_for_substr(str2, str);
1314         OBJ_INFECT(str2, str);
1315     }
1316
1317     return str2;
1318 }
1319
1320 VALUE
1321 rb_str_freeze(VALUE str)
1322 {
1323     if (STR_ASSOC_P(str)) {
1324         VALUE ary = RSTRING(str)->as.heap.aux.shared;
1325         OBJ_FREEZE(ary);
1326     }
1327     return rb_obj_freeze(str);
1328 }
1329
1330 VALUE
1331 rb_str_dup_frozen(VALUE str)
1332 {
1333     if (STR_SHARED_P(str) && RSTRING(str)->as.heap.aux.shared) {
1334         VALUE shared = RSTRING(str)->as.heap.aux.shared;
1335         if (RSTRING_LEN(shared) == RSTRING_LEN(str)) {
1336             OBJ_FREEZE(shared);
1337             return shared;
1338         }
1339     }
1340     if (OBJ_FROZEN(str)) return str;
1341     str = rb_str_dup(str);
1342     OBJ_FREEZE(str);
1343     return str;
1344 }
1345
1346 VALUE
1347 rb_str_locktmp(VALUE str)
1348 {
1349     if (FL_TEST(str, STR_TMPLOCK)) {
1350         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1351     }
1352     FL_SET(str, STR_TMPLOCK);
1353     return str;
1354 }
1355
1356 VALUE
1357 rb_str_unlocktmp(VALUE str)
1358 {
1359     if (!FL_TEST(str, STR_TMPLOCK)) {
1360         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1361     }
1362     FL_UNSET(str, STR_TMPLOCK);
1363     return str;
1364 }
1365
1366 void
1367 rb_str_set_len(VALUE str, long len)
1368 {
1369     STR_SET_LEN(str, len);
1370     RSTRING_PTR(str)[len] = '\0';
1371 }
1372
1373 VALUE
1374 rb_str_resize(VALUE str, long len)
1375 {
1376     long slen;
1377
1378     if (len < 0) {
1379         rb_raise(rb_eArgError, "negative string size (or size too big)");
1380     }
1381
1382     rb_str_modify(str);
1383     slen = RSTRING_LEN(str);
1384     if (len != slen) {
1385         if (STR_EMBED_P(str)) {
1386             char *ptr;
1387             if (len <= RSTRING_EMBED_LEN_MAX) {
1388                 STR_SET_EMBED_LEN(str, len);
1389                 RSTRING(str)->as.ary[len] = '\0';
1390                 return str;
1391             }
1392             ptr = ALLOC_N(char,len+1);
1393             MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
1394             RSTRING(str)->as.heap.ptr = ptr;
1395             STR_SET_NOEMBED(str);
1396         }
1397         else if (len <= RSTRING_EMBED_LEN_MAX) {
1398             char *ptr = RSTRING(str)->as.heap.ptr;
1399             STR_SET_EMBED(str);
1400             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, len);
1401             RSTRING(str)->as.ary[len] = '\0';
1402             STR_SET_EMBED_LEN(str, len);
1403             xfree(ptr);
1404             return str;
1405         }
1406         else if (slen < len || slen - len > 1024) {
1407             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1408         }
1409         if (!STR_NOCAPA_P(str)) {
1410             RSTRING(str)->as.heap.aux.capa = len;
1411         }
1412         RSTRING(str)->as.heap.len = len;
1413         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
1414     }
1415     return str;
1416 }
1417
1418 static VALUE
1419 str_buf_cat(VALUE str, const char *ptr, long len)
1420 {
1421     long capa, total, off = -1;
1422
1423     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1424         off = ptr - RSTRING_PTR(str);
1425     }
1426     rb_str_modify(str);
1427     if (len == 0) return 0;
1428     if (STR_ASSOC_P(str)) {
1429         FL_UNSET(str, STR_ASSOC);
1430         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1431     }
1432     else if (STR_EMBED_P(str)) {
1433         capa = RSTRING_EMBED_LEN_MAX;
1434     }
1435     else {
1436         capa = RSTRING(str)->as.heap.aux.capa;
1437     }
1438     if (RSTRING_LEN(str) >= LONG_MAX - len) {
1439         rb_raise(rb_eArgError, "string sizes too big");
1440     }
1441     total = RSTRING_LEN(str)+len;
1442     if (capa <= total) {
1443         while (total > capa) {
1444             if (capa + 1 >= LONG_MAX / 2) {
1445                 capa = (total + 4095) / 4096;
1446                 break;
1447             }
1448             capa = (capa + 1) * 2;
1449         }
1450         RESIZE_CAPA(str, capa);
1451     }
1452     if (off != -1) {
1453         ptr = RSTRING_PTR(str) + off;
1454     }
1455     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1456     STR_SET_LEN(str, total);
1457     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1458
1459     return str;
1460 }
1461
1462 VALUE
1463 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1464 {
1465     if (len == 0) return str;
1466     if (len < 0) {
1467         rb_raise(rb_eArgError, "negative string size (or size too big)");
1468     }
1469     return str_buf_cat(str, ptr, len);
1470 }
1471
1472 VALUE
1473 rb_str_buf_cat2(VALUE str, const char *ptr)
1474 {
1475     return rb_str_buf_cat(str, ptr, strlen(ptr));
1476 }
1477
1478 VALUE
1479 rb_str_cat(VALUE str, const char *ptr, long len)
1480 {
1481     if (len < 0) {
1482         rb_raise(rb_eArgError, "negative string size (or size too big)");
1483     }
1484     if (STR_ASSOC_P(str)) {
1485         rb_str_modify(str);
1486         if (STR_EMBED_P(str)) str_make_independent(str);
1487         REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
1488         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
1489         RSTRING(str)->as.heap.len += len;
1490         RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0'; /* sentinel */
1491         return str;
1492     }
1493
1494     return rb_str_buf_cat(str, ptr, len);
1495 }
1496
1497 VALUE
1498 rb_str_cat2(VALUE str, const char *ptr)
1499 {
1500     return rb_str_cat(str, ptr, strlen(ptr));
1501 }
1502
1503 static VALUE
1504 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
1505     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
1506 {
         /*
          * Appends len bytes at ptr to str, where the bytes are in the
          * encoding with index ptr_encindex and have coderange ptr_cr
          * (possibly ENC_CODERANGE_UNKNOWN).  Works out the encoding index
          * and coderange of the result and stores them on str.  When
          * ptr_cr_ret is non-null it receives the coderange determined for
          * the appended bytes.  Raises ArgumentError for incompatible
          * encodings or a negative len.  Returns str.
          */
1507     int str_encindex = ENCODING_GET(str);
1508     int res_encindex;
1509     int str_cr, res_cr;
1510     int str_a8 = ENCODING_IS_ASCII8BIT(str);
         /* presumably encoding index 0 is ASCII-8BIT, mirroring str_a8 -- TODO confirm */
1511     int ptr_a8 = ptr_encindex == 0;
1512
1513     str_cr = ENC_CODERANGE(str);
1514
1515     if (str_encindex == ptr_encindex) {
             /* same encoding: scan ptr only when the result coderange can use it */
1516         if (str_cr == ENC_CODERANGE_UNKNOWN ||
1517             (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
1518             ptr_cr = ENC_CODERANGE_UNKNOWN;
1519         }
1520         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1521             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
1522         }
1523     }
1524     else {
1525         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
1526         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
1527         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
                 /* a non-ASCII-compatible side only mixes when one side is empty */
1528             if (len == 0)
1529                 return str;
1530             if (RSTRING_LEN(str) == 0) {
                     /* empty receiver simply takes over ptr's encoding */
1531                 rb_str_buf_cat(str, ptr, len);
1532                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
1533                 return str;
1534             }
1535             goto incompatible;
1536         }
1537         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1538             ptr_cr = coderange_scan(ptr, len, ptr_enc);
1539         }
1540         if (str_cr == ENC_CODERANGE_UNKNOWN) {
                 /* str's coderange is only needed when it can affect the outcome */
1541             if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
1542                 str_cr = rb_enc_str_coderange(str);
1543             }
1544         }
1545     }
1546     if (ptr_cr_ret)
1547         *ptr_cr_ret = ptr_cr;
1548
         /* different encodings are tolerated only if at least one side is 7-bit */
1549     if (str_encindex != ptr_encindex &&
1550         str_cr != ENC_CODERANGE_7BIT &&
1551         ptr_cr != ENC_CODERANGE_7BIT) {
1552       incompatible:
1553         rb_raise(rb_eArgError, "append incompatible encoding strings: %s and %s",
1554             rb_enc_name(rb_enc_from_index(str_encindex)),
1555             rb_enc_name(rb_enc_from_index(ptr_encindex)));
1556     }
1557
         /* choose the encoding index and coderange of the concatenation */
1558     if (str_cr == ENC_CODERANGE_UNKNOWN) {
1559         res_encindex = str_encindex;
1560         res_cr = ENC_CODERANGE_UNKNOWN;
1561     }
1562     else if (str_cr == ENC_CODERANGE_7BIT) {
1563         if (ptr_cr == ENC_CODERANGE_7BIT) {
                 /* both 7-bit: keep str's encoding unless str is ASCII-8BIT */
1564             res_encindex = !str_a8 ? str_encindex : ptr_encindex;
1565             res_cr = ENC_CODERANGE_7BIT;
1566         }
1567         else {
                 /* 7-bit receiver adopts the encoding of the appended bytes */
1568             res_encindex = ptr_encindex;
1569             res_cr = ptr_cr;
1570         }
1571     }
1572     else if (str_cr == ENC_CODERANGE_VALID) {
1573         res_encindex = str_encindex;
1574         res_cr = str_cr;
1575     }
1576     else { /* str_cr == ENC_CODERANGE_BROKEN */
1577         res_encindex = str_encindex;
1578         res_cr = str_cr;
             /* any appended byte sends the coderange back to unknown */
1579         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
1580     }
1581
         /* NOTE(review): len is validated only here, after it has already
          * been passed to coderange_scan above -- confirm that is harmless */
1582     if (len < 0) {
1583         rb_raise(rb_eArgError, "negative string size (or size too big)");
1584     }
1585     str_buf_cat(str, ptr, len);
1586     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
1587     return str;
1588 }
1589
1590 VALUE
1591 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
1592 {
1593     return rb_enc_cr_str_buf_cat(str, ptr, len,
1594         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
1595 }
1596
1597 VALUE
1598 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
1599 {
1600     /* ptr must reference NUL terminated ASCII string. */
1601     int encindex = ENCODING_GET(str);
1602     rb_encoding *enc = rb_enc_from_index(encindex);
1603     if (rb_enc_asciicompat(enc)) {
1604         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
1605             encindex, ENC_CODERANGE_7BIT, 0);
1606     }
1607     else {
1608         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
1609         while (*ptr) {
1610             int c = (unsigned char)*ptr;
1611             int len = rb_enc_codelen(c, enc);
1612             rb_enc_mbcput(c, buf, enc);
1613             rb_enc_cr_str_buf_cat(str, buf, len,
1614                 encindex, ENC_CODERANGE_VALID, 0);
1615             ptr++;
1616         }
1617         return str;
1618     }
1619 }
1620
1621 VALUE
1622 rb_str_buf_append(VALUE str, VALUE str2)
1623 {
1624     int str2_cr;
1625
1626     str2_cr = ENC_CODERANGE(str2);
1627
1628     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
1629         ENCODING_GET(str2), str2_cr, &str2_cr);
1630
1631     OBJ_INFECT(str, str2);
1632     ENC_CODERANGE_SET(str2, str2_cr);
1633
1634     return str;
1635 }
1636
1637 VALUE
1638 rb_str_append(VALUE str, VALUE str2)
1639 {
1640     rb_encoding *enc;
1641     int cr, cr2;
1642
1643     StringValue(str2);
1644     if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
1645         long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
1646         enc = rb_enc_check(str, str2);
1647         cr = ENC_CODERANGE(str);
1648         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
1649         rb_str_modify(str);
1650         REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1651         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
1652                RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
1653         RSTRING(str)->as.heap.len = len;
1654         rb_enc_associate(str, enc);
1655         ENC_CODERANGE_SET(str, cr);
1656         OBJ_INFECT(str, str2);
1657         return str;
1658     }
1659     return rb_str_buf_append(str, str2);
1660 }
1661
1662
1663 /*
1664  *  call-seq:
1665  *     str << fixnum        => str
1666  *     str.concat(fixnum)   => str
1667  *     str << obj           => str
1668  *     str.concat(obj)      => str
1669  *
1670  *  Append---Concatenates the given object to <i>str</i>. If the object is a
1671  *  <code>Fixnum</code>, it is considered as a codepoint, and is converted
1672  *  to a character before concatenation.
1673  *
1674  *     a = "hello "
1675  *     a << "world"   #=> "hello world"
1676  *     a.concat(33)   #=> "hello world!"
1677  */
1678
1679 VALUE
1680 rb_str_concat(VALUE str1, VALUE str2)
1681 {
1682     if (FIXNUM_P(str2)) {
1683         rb_encoding *enc = STR_ENC_GET(str1);
1684         int c = FIX2INT(str2);
1685         int pos = RSTRING_LEN(str1);
1686         int len = rb_enc_codelen(c, enc);
1687         int cr = ENC_CODERANGE(str1);
1688
1689         rb_str_resize(str1, pos+len);
1690         rb_enc_mbcput(c, RSTRING_PTR(str1)+pos, enc);
1691         ENC_CODERANGE_SET(str1, cr);
1692         return str1;
1693     }
1694     return rb_str_append(str1, str2);
1695 }
1696
     /* x86 (32-bit) is treated as able to load 32-bit words from any address;
      * every other target takes the alignment-aware path in hash() */
1697 #if defined __i386__ || defined _M_IX86
1698 #define UNALIGNED_WORD_ACCESS 1
1699 #endif
1700 #ifndef UNALIGNED_WORD_ACCESS
1701 #define UNALIGNED_WORD_ACCESS 0
1702 #endif
1703
1704 /* MurmurHash described in http://murmurhash.googlepages.com/ */
     /*
      * Hashes len bytes at data, starting from seed h, and returns the
      * 32-bit result.  The input is consumed four bytes at a time; each
      * word is mixed with "h += word; h *= m; h ^= h >> r".  Remaining
      * 1-3 bytes are folded in at the end, followed by a final scramble.
      */
1705 static unsigned int
1706 hash(const unsigned char * data, int len, unsigned int h)
1707 {
1708     const unsigned int m = 0x7fd652ad;
1709     const int r = 16;
1710
1711     h += 0xdeadbeef;
1712
1713     if (len >= 4) {
1714 #if !UNALIGNED_WORD_ACCESS
             /* number of bytes data sits past a 4-byte boundary */
1715         int align = (VALUE)data & 3;
1716         if (align) {
                 /*
                  * Misaligned input: only aligned words are loaded.  t carries
                  * the bytes left over from the previous load and is combined
                  * with part of the next word d to rebuild each input word.
                  */
1717             uint32_t t = 0, d = 0;
1718             int sl, sr, pack;
1719
                 /* gather the 4-align bytes up to the next boundary
                  * (cases fall through on purpose) */
1720             switch (align) {
1721 #ifdef WORDS_BIGENDIAN
1722               case 1: t |= data[2];
1723               case 2: t |= data[1] << 8;
1724               case 3: t |= data[0] << 16;
1725 #else
1726               case 1: t |= data[2] << 16;
1727               case 2: t |= data[1] << 8;
1728               case 3: t |= data[0];
1729 #endif
1730             }
1731
1732 #ifdef WORDS_BIGENDIAN
1733             t >>= (8 * align) - 8;
1734 #else
1735             t <<= (8 * align);
1736 #endif
1737
                 /* data now points at an aligned address */
1738             data += 4-align;
1739             len -= 4-align;
1740
1741             sl = 8 * (4-align);
1742             sr = 8 * align;
1743
1744             while (len >= 4) {
                     /* NOTE(review): word load through a cast pointer --
                      * relies on the compiler tolerating this aliasing */
1745                 d = *(uint32_t *)data;
1746 #ifdef WORDS_BIGENDIAN
1747                 t = (t << sr) | (d >> sl);
1748 #else
1749                 t = (t >> sr) | (d << sl);
1750 #endif
1751                 h += t;
1752                 h *= m;
1753                 h ^= h >> r;
1754                 t = d;
1755
1756                 data += 4;
1757                 len -= 4;
1758             }
1759
                 /* up to align more bytes complete the word still pending in t */
1760             pack = len < align ? len : align;
1761             d = 0;
                 /* NOTE(review): data[n] promotes to int, so "<< 24" on a byte
                  * >= 0x80 shifts into the sign bit -- confirm this is
                  * acceptable on the supported compilers */
1762             switch (pack) {
1763 #ifdef WORDS_BIGENDIAN
1764               case 3: d |= data[2] << 8;
1765               case 2: d |= data[1] << 16;
1766               case 1: d |= data[0] << 24;
1767               case 0:
1768                 h += (t << sr) | (d >> sl);
1769 #else
1770               case 3: d |= data[2] << 16;
1771               case 2: d |= data[1] << 8;
1772               case 1: d |= data[0];
1773               case 0:
1774                 h += (t >> sr) | (d << sl);
1775 #endif
1776                 h *= m;
1777                 h ^= h >> r;
1778             }
1779
1780             data += pack;
1781             len -= pack;
1782         }
1783         else
1784 #endif
1785         {
                 /* aligned input (or unaligned loads allowed): plain word loop */
1786             do {
1787                 h += *(uint32_t *)data;
1788                 h *= m;
1789                 h ^= h >> r;
1790
1791                 data += 4;
1792                 len -= 4;
1793             } while (len >= 4);
1794         }
1795     }
1796
         /* fold in the 1-3 bytes left over (cases fall through on purpose);
          * nothing is mixed when len is 0 */
1797     switch(len) {
1798 #ifdef WORDS_BIGENDIAN
1799       case 3:
1800         h += data[2] << 8;
1801       case 2:
1802         h += data[1] << 16;
1803       case 1:
1804         h += data[0] << 24;
1805 #else
1806       case 3:
1807         h += data[2] << 16;
1808       case 2:
1809         h += data[1] << 8;
1810       case 1:
1811         h += data[0];
1812 #endif
1813         h *= m;
1814         h ^= h >> r;
1815     }
1816
         /* final scramble of the accumulated state */
1817     h *= m;
1818     h ^= h >> 10;
1819     h *= m;
1820     h ^= h >> 17;
1821
1822     return h;
1823 }
1824
/*
 * Hashes len bytes at ptr.  The seed is drawn from rb_genrand_int32() on
 * the first call and reused for every later call.
 */
int
rb_memhash(const void *ptr, long len)
{
    static int seeded = 0;
    static unsigned int seed;

    if (seeded == 0) {
        seed = rb_genrand_int32();
        seeded = 1;
    }
    return hash(ptr, len, seed);
}
1838
1839 int
1840 rb_str_hash(VALUE str)
1841 {
1842     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
1843 }
1844
1845 int
1846 rb_str_hash_cmp(VALUE str1, VALUE str2)
1847 {
1848     int len;
1849
1850     if (!rb_str_comparable(str1, str2)) return 1;
1851     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1852         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1853         return 0;
1854     }
1855     return 1;
1856 }
1857
1858 /*
1859  * call-seq:
1860  *    str.hash   => fixnum
1861  *
1862  * Return a hash based on the string's length and content.
1863  */
1864
1865 static VALUE
1866 rb_str_hash_m(VALUE str)
1867 {
1868     int hval = rb_str_hash(str);
1869     return INT2FIX(hval);
1870 }
1871
     /* smaller of a and b; each argument is evaluated twice, so pass
      * side-effect-free expressions only */
1872 #define lesser(a,b) (((a)>(b))?(b):(a))
1873
1874 int
1875 rb_str_comparable(VALUE str1, VALUE str2)
1876 {
1877     int idx1, idx2;
1878     int rc1, rc2;
1879
1880     if (RSTRING_LEN(str1) == 0) return Qtrue;
1881     if (RSTRING_LEN(str2) == 0) return Qtrue;
1882     idx1 = ENCODING_GET(str1);
1883     idx2 = ENCODING_GET(str2);
1884     if (idx1 == idx2) return Qtrue;
1885     rc1 = rb_enc_str_coderange(str1);
1886     rc2 = rb_enc_str_coderange(str2);
1887     if (rc1 == ENC_CODERANGE_7BIT) {
1888         if (rc2 == ENC_CODERANGE_7BIT) return Qtrue;
1889         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
1890             return Qtrue;
1891     }
1892     if (rc2 == ENC_CODERANGE_7BIT) {
1893         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
1894             return Qtrue;
1895     }
1896     return Qfalse;
1897 }
1898
1899 int
1900 rb_str_cmp(VALUE str1, VALUE str2)
1901 {
1902     long len;
1903     int retval;
1904
1905     len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
1906     retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
1907     if (retval == 0) {
1908         if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
1909             if (!rb_enc_compatible(str1, str2)) {
1910                 if (ENCODING_GET(str1) - ENCODING_GET(str2) > 0)
1911                     return 1;
1912                 return -1;
1913             }
1914             return 0;
1915         }
1916         if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
1917         return -1;
1918     }
1919     if (retval > 0) return 1;
1920     return -1;
1921 }
1922
1923
1924 /*
1925  *  call-seq:
1926  *     str == obj   => true or false
1927  *
1928  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
1929  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
1930  *  <code><=></code> <i>obj</i> returns zero.
1931  */
1932
1933 VALUE
1934 rb_str_equal(VALUE str1, VALUE str2)
1935 {
1936     int len;
1937
1938     if (str1 == str2) return Qtrue;
1939     if (TYPE(str2) != T_STRING) {
1940         if (!rb_respond_to(str2, rb_intern("to_str"))) {
1941             return Qfalse;
1942         }
1943         return rb_equal(str2, str1);
1944     }
1945     if (!rb_str_comparable(str1, str2)) return Qfalse;
1946     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1947         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1948         return Qtrue;
1949     }
1950     return Qfalse;
1951 }
1952
1953 /*
1954  * call-seq:
1955  *   str.eql?(other)   => true or false
1956  *
1957  * Two strings are equal if they have the same length and content.
1958  */
1959
1960 static VALUE
1961 rb_str_eql(VALUE str1, VALUE str2)
1962 {
1963     if (TYPE(str2) != T_STRING || RSTRING_LEN(str1) != RSTRING_LEN(str2))
1964         return Qfalse;
1965
1966     if (!rb_str_comparable(str1, str2)) return Qfalse;
1967     if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2),
1968                lesser(RSTRING_LEN(str1), RSTRING_LEN(str2))) == 0)
1969         return Qtrue;
1970
1971     return Qfalse;
1972 }
1973
1974 /*
1975  *  call-seq:
1976  *     str <=> other_str   => -1, 0, +1
1977  *
1978  *  Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
1979  *  <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
1980  *  <i>str</i>. If the strings are of different lengths, and the strings are
1981  *  equal when compared up to the shortest length, then the longer string is
1982  *  considered greater than the shorter one. In older versions of Ruby, setting
1983  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
1984  *  in favor of using <code>String#casecmp</code>.
1985  *
1986  *  <code><=></code> is the basis for the methods <code><</code>,
1987  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
1988  *  included from module <code>Comparable</code>.  The method
1989  *  <code>String#==</code> does not use <code>Comparable#==</code>.
1990  *
1991  *     "abcdef" <=> "abcde"     #=> 1
1992  *     "abcdef" <=> "abcdef"    #=> 0
1993  *     "abcdef" <=> "abcdefg"   #=> -1
1994  *     "abcdef" <=> "ABCDEF"    #=> 1
1995  */
1996
1997 static VALUE
1998 rb_str_cmp_m(VALUE str1, VALUE str2)
1999 {
2000     long result;
2001
2002     if (TYPE(str2) != T_STRING) {
2003         if (!rb_respond_to(str2, rb_intern("to_str"))) {
2004             return Qnil;
2005         }
2006         else if (!rb_respond_to(str2, rb_intern("<=>"))) {
2007             return Qnil;
2008         }
2009         else {
2010             VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
2011
2012             if (NIL_P(tmp)) return Qnil;
2013             if (!FIXNUM_P(tmp)) {
2014                 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
2015             }
2016             result = -FIX2LONG(tmp);
2017         }
2018     }
2019     else {
2020         result = rb_str_cmp(str1, str2);
2021     }
2022     return LONG2NUM(result);
2023 }
2024
2025 /*
2026  *  call-seq:
2027  *     str.casecmp(other_str)   => -1, 0, +1
2028  *
2029  *  Case-insensitive version of <code>String#<=></code>.
2030  *
2031  *     "abcdef".casecmp("abcde")     #=> 1
2032  *     "aBcDeF".casecmp("abcdef")    #=> 0
2033  *     "abcdef".casecmp("abcdefg")   #=> -1
2034  *     "abcdef".casecmp("ABCDEF")    #=> 0
2035  */
2036
2037 static VALUE
2038 rb_str_casecmp(VALUE str1, VALUE str2)
2039 {
2040     long len;
2041     rb_encoding *enc;
2042     char *p1, *p1end, *p2, *p2end;
2043
2044     StringValue(str2);
2045     enc = rb_enc_compatible(str1, str2);
2046     if (!enc) {
2047         return Qnil;
2048     }
2049
2050     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2051     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2052     while (p1 < p1end && p2 < p2end) {
2053         int c1 = rb_enc_codepoint(p1, p1end, enc);
2054         int c2 = rb_enc_codepoint(p2, p2end, enc);
2055
2056         if (c1 != c2) {
2057             c1 = rb_enc_toupper(c1, enc);
2058             c2 = rb_enc_toupper(c2, enc);
2059             if (c1 > c2) return INT2FIX(1);
2060             if (c1 < c2) return INT2FIX(-1);
2061         }
2062         len = rb_enc_codelen(c1, enc);
2063         p1 += len;
2064         p2 += len;
2065     }
2066     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2067     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2068     return INT2FIX(-1);
2069 }
2070
2071 static long
2072 rb_str_index(VALUE str, VALUE sub, long offset)
2073 {
2074     long pos;
2075     char *s, *sptr;
2076     long len, slen;
2077     rb_encoding *enc;
2078
2079     enc = rb_enc_check(str, sub);
2080     if (is_broken_string(sub)) {
2081         return -1;
2082     }
2083     len = str_strlen(str, enc);
2084     slen = str_strlen(sub, enc);
2085     if (offset < 0) {
2086         offset += len;
2087         if (offset < 0) return -1;
2088     }
2089     if (len - offset < slen) return -1;
2090     s = RSTRING_PTR(str);
2091     if (offset) {
2092         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2093         s += offset;
2094     }
2095     if (slen == 0) return offset;
2096     /* need proceed one character at a time */
2097     sptr = RSTRING_PTR(sub);
2098     slen = RSTRING_LEN(sub);
2099     len = RSTRING_LEN(str) - offset;
2100     for (;;) {
2101         char *t;
2102         pos = rb_memsearch(sptr, slen, s, len, enc);
2103         if (pos < 0) return pos;
2104         t = rb_enc_right_char_head(s, s+pos, enc);
2105         if (t == s + pos) break;
2106         if ((len -= t - s) <= 0) return -1;
2107         offset += t - s;
2108         s = t;
2109     }
2110     return pos + offset;
2111 }
2112
2113
2114 /*
2115  *  call-seq:
2116  *     str.index(substring [, offset])   => fixnum or nil
2117  *     str.index(regexp [, offset])      => fixnum or nil
2118  *
2119  *  Returns the index of the first occurrence of the given <i>substring</i> or
2120  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2121  *  found. If the second parameter is present, it specifies the position in the
2122  *  string to begin the search.
2123  *
2124  *     "hello".index('e')             #=> 1
2125  *     "hello".index('lo')            #=> 3
2126  *     "hello".index('a')             #=> nil
2127  *     "hello".index(?e)              #=> 1
2128  *     "hello".index(/[aeiou]/, -3)   #=> 4
2129  */
2130
2131 static VALUE
2132 rb_str_index_m(int argc, VALUE *argv, VALUE str)
2133 {
2134     VALUE sub;
2135     VALUE initpos;
2136     long pos;
2137
2138     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2139         pos = NUM2LONG(initpos);
2140     }
2141     else {
2142         pos = 0;
2143     }
2144     if (pos < 0) {
2145         pos += str_strlen(str, STR_ENC_GET(str));
2146         if (pos < 0) {
2147             if (TYPE(sub) == T_REGEXP) {
2148                 rb_backref_set(Qnil);
2149             }
2150             return Qnil;
2151         }
2152     }
2153
2154     switch (TYPE(sub)) {
2155       case T_REGEXP:
2156         pos = rb_reg_adjust_startpos(sub, str, pos, 0);
2157         pos = rb_reg_search(sub, str, pos, 0);
2158         pos = rb_str_sublen(str, pos);
2159         break;
2160
2161       default: {
2162         VALUE tmp;
2163
2164         tmp = rb_check_string_type(sub);
2165         if (NIL_P(tmp)) {
2166             rb_raise(rb_eTypeError, "type mismatch: %s given",
2167                      rb_obj_classname(sub));
2168         }
2169         sub = tmp;
2170       }
2171         /* fall through */
2172       case T_STRING:
2173         pos = rb_str_index(str, sub, pos);
2174         pos = rb_str_sublen(str, pos);
2175         break;
2176     }
2177
2178     if (pos == -1) return Qnil;
2179     return LONG2NUM(pos);
2180 }
2181
2182 static long
2183 rb_str_rindex(VALUE str, VALUE sub, long pos)
2184 {
2185     long len, slen;
2186     char *s, *sbeg, *e, *t;
2187     rb_encoding *enc;
2188     int singlebyte = single_byte_optimizable(str);
2189
2190     enc = rb_enc_check(str, sub);
2191     if (is_broken_string(sub)) {
2192         return -1;
2193     }
2194     len = str_strlen(str, enc);
2195     slen = str_strlen(sub, enc);
2196     /* substring longer than string */
2197     if (len < slen) return -1;
2198     if (len - pos < slen) {
2199         pos = len - slen;
2200     }
2201     if (len == 0) {
2202         return pos;
2203     }
2204     sbeg = RSTRING_PTR(str);
2205     e = RSTRING_END(str);
2206     t = RSTRING_PTR(sub);
2207     slen = RSTRING_LEN(sub);
2208     for (;;) {
2209         s = str_nth(sbeg, e, pos, enc, singlebyte);
2210         if (!s) return -1;
2211         if (memcmp(s, t, slen) == 0) {
2212             return pos;
2213         }
2214         if (pos == 0) break;
2215         pos--;
2216     }
2217     return -1;
2218 }
2219
2220
2221 /*
2222  *  call-seq:
2223  *     str.rindex(substring [, fixnum])   => fixnum or nil
2224  *     str.rindex(regexp [, fixnum])   => fixnum or nil
2225  *
2226  *  Returns the index of the last occurrence of the given <i>substring</i> or
2227  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2228  *  found. If the second parameter is present, it specifies the position in the
2229  *  string to end the search---characters beyond this point will not be
2230  *  considered.
2231  *
2232  *     "hello".rindex('e')             #=> 1
2233  *     "hello".rindex('l')             #=> 3
2234  *     "hello".rindex('a')             #=> nil
2235  *     "hello".rindex(?e)              #=> 1
2236  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
2237  */
2238
2239 static VALUE
2240 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
2241 {
2242     VALUE sub;
2243     VALUE vpos;
2244     rb_encoding *enc = STR_ENC_GET(str);
2245     long pos, len = str_strlen(str, enc);
2246
2247     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2248         pos = NUM2LONG(vpos);
2249         if (pos < 0) {
2250             pos += len;
2251             if (pos < 0) {
2252                 if (TYPE(sub) == T_REGEXP) {
2253                     rb_backref_set(Qnil);
2254                 }
2255                 return Qnil;
2256             }
2257         }
2258         if (pos > len) pos = len;
2259     }
2260     else {
2261         pos = len;
2262     }
2263
2264     switch (TYPE(sub)) {
2265       case T_REGEXP:
2266         /* enc = rb_get_check(str, sub); */
2267         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2268             pos = rb_reg_adjust_startpos(sub, str, pos, 1);
2269             pos = rb_reg_search(sub, str, pos, 1);
2270             pos = rb_str_sublen(str, pos);
2271         }
2272         if (pos >= 0) return LONG2NUM(pos);
2273         break;
2274
2275       default: {
2276         VALUE tmp;
2277
2278         tmp = rb_check_string_type(sub);
2279         if (NIL_P(tmp)) {
2280             rb_raise(rb_eTypeError, "type mismatch: %s given",
2281                      rb_obj_classname(sub));
2282         }
2283         sub = tmp;
2284       }
2285         /* fall through */
2286       case T_STRING:
2287         pos = rb_str_rindex(str, sub, pos);
2288         if (pos >= 0) return LONG2NUM(pos);
2289         break;
2290     }
2291     return Qnil;
2292 }
2293
2294 /*
2295  *  call-seq:
2296  *     str =~ obj   => fixnum or nil
2297  *
2298  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
2299  *  against <i>str</i>,and returns the position the match starts, or
2300  *  <code>nil</code> if there is no match. Otherwise, invokes
2301  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2302  *  <code>=~</code> in <code>Object</code> returns <code>false</code>.
2303  *
2304  *     "cat o' 9 tails" =~ /\d/   #=> 7
2305  *     "cat o' 9 tails" =~ 9      #=> nil
2306  */
2307
2308 static VALUE
2309 rb_str_match(VALUE x, VALUE y)
2310 {
2311     switch (TYPE(y)) {
2312       case T_STRING:
2313         rb_raise(rb_eTypeError, "type mismatch: String given");
2314
2315       case T_REGEXP:
2316         return rb_reg_match(y, x);
2317
2318       default:
2319         return rb_funcall(y, rb_intern("=~"), 1, x);
2320     }
2321 }
2322
2323
2324 static VALUE get_pat(VALUE, int);   /* forward declaration: (pattern, quote flag); used by rb_str_match_m() below */
2325
2326
2327 /*
2328  *  call-seq:
2329  *     str.match(pattern)   => matchdata or nil
2330  *
2331  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2332  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
2333  *  parameter is present, it specifies the position in the string to begin the
2334  *  search.
2335  *
2336  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
2337  *     'hello'.match('(.)\1')[0]   #=> "ll"
2338  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
2339  *     'hello'.match('xx')         #=> nil
2340  *
2341  *  If a block is given, invoke the block with MatchData if match succeed, so
2342  *  that you can write
2343  *
2344  *     str.match(pat) {|m| ...}
2345  *
2346  *  instead of
2347  *
2348  *     if m = str.match(pat)
2349  *       ...
2350  *     end
2351  *
2352  *  The return value is a value from block execution in this case.
2353  */
2354
2355 static VALUE
2356 rb_str_match_m(int argc, VALUE *argv, VALUE str)
2357 {
2358     VALUE re, result;
2359     if (argc < 1)
2360         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2361     re = argv[0];
2362     argv[0] = str;
2363     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2364     if (!NIL_P(result) && rb_block_given_p()) {
2365         return rb_yield(result);
2366     }
2367     return result;
2368 }
2369
2370 enum neighbor_char {   /* outcome of stepping a character to its successor/predecessor */
2371     NEIGHBOR_NOT_CHAR,   /* no usable neighbour: not alphanumeric, or its range holds a single character */
2372     NEIGHBOR_FOUND,      /* the bytes now hold a valid neighbouring character of the same length */
2373     NEIGHBOR_WRAPPED     /* the byte counter ran over its end, or the alphanumeric range wrapped */
2374 };
2375
     /*
      * Step the +len+-byte sequence at +p+ forward, in place, treating it
      * as a byte counter (rightmost byte is least significant) until
      * rb_enc_precise_mbclen() reports a character exactly +len+ bytes
      * long in +enc+.
      *
      * Returns NEIGHBOR_FOUND when such a character is reached, or
      * NEIGHBOR_WRAPPED when every byte was 0xff, in which case all
      * +len+ bytes have been reset to zero.
      */
2376 static enum neighbor_char
2377 enc_succ_char(char *p, int len, rb_encoding *enc)
2378 {
2379     int i, l;
2380     while (1) {
         /* carry: trailing 0xff bytes roll over to 0 */
2381         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
2382             p[i] = '\0';
         /* every byte was 0xff: nothing left to increment */
2383         if (i < 0)
2384             return NEIGHBOR_WRAPPED;
2385         ++((unsigned char*)p)[i];
2386         l = rb_enc_precise_mbclen(p, p+len, enc);
2387         if (MBCLEN_CHARFOUND_P(l)) {
2388             l = MBCLEN_CHARFOUND_LEN(l);
2389             if (l == len) {
2390                 return NEIGHBOR_FOUND;
2391             }
2392             else {
                 /* a shorter character was found: saturate the tail so
                    the next round carries into that prefix */
2393                 memset(p+l, 0xff, len-l);
2394             }
2395         }
         /* invalid sequence and the incremented byte was not the last one:
            look for the longest prefix that is not reported invalid and
            saturate the bytes after it, skipping combinations that
            cannot become valid */
2396         if (MBCLEN_INVALID_P(l) && i < len-1) {
2397             int len2, l2;
2398             for (len2 = len-1; 0 < len2; len2--) {
2399                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2400                 if (!MBCLEN_INVALID_P(l2))
2401                     break;
2402             }
2403             memset(p+len2+1, 0xff, len-(len2+1));
2404         }
2405     }
2406 }
2407
     /*
      * Step the +len+-byte sequence at +p+ backward, in place, treating it
      * as a byte counter (rightmost byte is least significant) until
      * rb_enc_precise_mbclen() reports a character exactly +len+ bytes
      * long in +enc+.  Mirror image of enc_succ_char().
      *
      * Returns NEIGHBOR_FOUND when such a character is reached, or
      * NEIGHBOR_WRAPPED when every byte was zero, in which case all
      * +len+ bytes have been set to 0xff.
      */
2408 static enum neighbor_char
2409 enc_pred_char(char *p, int len, rb_encoding *enc)
2410 {
2411     int i, l;
2412     while (1) {
         /* borrow: trailing zero bytes roll under to 0xff */
2413         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
2414             p[i] = '\xff';
         /* every byte was zero: nothing left to decrement */
2415         if (i < 0)
2416             return NEIGHBOR_WRAPPED;
2417         --((unsigned char*)p)[i];
2418         l = rb_enc_precise_mbclen(p, p+len, enc);
2419         if (MBCLEN_CHARFOUND_P(l)) {
2420             l = MBCLEN_CHARFOUND_LEN(l);
2421             if (l == len) {
2422                 return NEIGHBOR_FOUND;
2423             }
2424             else {
                 /* a shorter character was found: clear the tail so the
                    next round borrows from that prefix */
2425                 memset(p+l, 0, len-l);
2426             }
2427         }
         /* invalid sequence and the decremented byte was not the last one:
            look for the longest prefix that is not reported invalid and
            zero the bytes after it, skipping combinations that cannot
            become valid */
2428         if (MBCLEN_INVALID_P(l) && i < len-1) {
2429             int len2, l2;
2430             for (len2 = len-1; 0 < len2; len2--) {
2431                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2432                 if (!MBCLEN_INVALID_P(l2))
2433                     break;
2434             }
2435             memset(p+len2+1, 0, len-(len2+1));
2436         }
2437     }
2438 }
2439
2440 /*
       Overwrites the +len+-byte character at +p+ with the succeeding
       letter or digit in +enc+ and returns NEIGHBOR_FOUND or
       NEIGHBOR_WRAPPED.

       When NEIGHBOR_WRAPPED is returned, +p+ holds the first character of
       its range and the carried-out character is stored into +carry+
       (+len+ bytes): the first character of the range for letters, its
       successor for digits.

       Assumes each range is contiguous and that the byte length of a
       character never changes within a range.

       NEIGHBOR_NOT_CHAR is returned if the character is neither a digit
       nor a letter, or if its range has only one character; +p+ is left
       unchanged in that case.
      */
2449 static enum neighbor_char
2450 enc_succ_alnum_char(char *p, int len, rb_encoding *enc, char *carry)
2451 {
2452     enum neighbor_char ret;
2453     int c;
2454     int ctype;
2455     int range;
2456     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2457
     /* classify the character: only digits and letters are handled */
2458     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2459     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2460         ctype = ONIGENC_CTYPE_DIGIT;
2461     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2462         ctype = ONIGENC_CTYPE_ALPHA;
2463     else
2464         return NEIGHBOR_NOT_CHAR;
2465
     /* easy case: the next character is of the same class */
2466     MEMCPY(save, p, char, len);
2467     ret = enc_succ_char(p, len, enc);
2468     if (ret == NEIGHBOR_FOUND) {
2469         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2470         if (rb_enc_isctype(c, ctype, enc))
2471             return NEIGHBOR_FOUND;
2472     }
     /* otherwise undo the step and walk backwards to the first character
        of the range, counting the characters seen on the way */
2473     MEMCPY(p, save, char, len);
2474     range = 1;
2475     while (1) {
2476         MEMCPY(save, p, char, len);
2477         ret = enc_pred_char(p, len, enc);
2478         if (ret == NEIGHBOR_FOUND) {
2479             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2480             if (!rb_enc_isctype(c, ctype, enc)) {
                 /* stepped out of the class: keep the last member */
2481                 MEMCPY(p, save, char, len);
2482                 break;
2483             }
2484         }
2485         else {
2486             MEMCPY(p, save, char, len);
2487             break;
2488         }
2489         range++;
2490     }
     /* a range of one character cannot wrap around */
2491     if (range == 1) {
2492         return NEIGHBOR_NOT_CHAR;
2493     }
2494
     /* letters carry out the first character of the range ... */
2495     if (ctype != ONIGENC_CTYPE_DIGIT) {
2496         MEMCPY(carry, p, char, len);
2497         return NEIGHBOR_WRAPPED;
2498     }
2499
     /* ... digits carry out the one following the first */
2500     MEMCPY(carry, p, char, len);
2501     enc_succ_char(carry, len, enc);
2502     return NEIGHBOR_WRAPPED;
2503 }
2504
2505
2506 /*
2507  *  call-seq:
2508  *     str.succ   => new_str
2509  *     str.next   => new_str
2510  *
2511  *  Returns the successor to <i>str</i>. The successor is calculated by
2512  *  incrementing characters starting from the rightmost alphanumeric (or
2513  *  the rightmost character if there are no alphanumerics) in the
2514  *  string. Incrementing a digit always results in another digit, and
2515  *  incrementing a letter results in another letter of the same case.
2516  *  Incrementing nonalphanumerics uses the underlying character set's
2517  *  collating sequence.
2518  *
2519  *  If the increment generates a ``carry,'' the character to the left of
2520  *  it is incremented. This process repeats until there is no carry,
2521  *  adding an additional character if necessary.
2522  *
2523  *     "abcd".succ        #=> "abce"
2524  *     "THX1138".succ     #=> "THX1139"
2525  *     "<<koala>>".succ   #=> "<<koalb>>"
2526  *     "1999zzz".succ     #=> "2000aaa"
2527  *     "ZZZ9999".succ     #=> "AAAA0000"
2528  *     "***".succ         #=> "**+"
2529  */
2530
2531 VALUE
2532 rb_str_succ(VALUE orig)
2533 {
2534     rb_encoding *enc;
2535     VALUE str;
2536     char *sbeg, *s, *e, *last_alnum = 0;
2537     int c = -1;   /* stays -1 until some alphanumeric character wraps */
2538     long l;
2539     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";   /* bytes inserted at carry_pos when the increment overflows */
2540     int carry_pos = 0, carry_len = 1;
2541     enum neighbor_char neighbor = NEIGHBOR_FOUND;
2542
     /* work on a copy; the receiver itself is never modified here */
2543     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
2544     rb_enc_cr_str_copy_for_substr(str, orig);
2545     OBJ_INFECT(str, orig);
2546     if (RSTRING_LEN(str) == 0) return str;
2547
2548     enc = STR_ENC_GET(orig);
2549     sbeg = RSTRING_PTR(str);
2550     s = e = sbeg + RSTRING_LEN(str);
2551
     /* first pass: walk the characters from right to left, incrementing
        alphanumerics and propagating the carry leftwards */
2552     while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
         /* after skipping a non-alphanumeric, stop carrying when the
            character reached is a digit while the last wrapped one was a
            letter, or the other way round (byte-wise ASCII tests) */
2553         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
2554             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
2555                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
2556                 s = last_alnum;
2557                 break;
2558             }
2559         }
         /* skip positions that do not hold a complete valid character */
2560         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2561         neighbor = enc_succ_alnum_char(s, l, enc, carry);
2562         switch (neighbor) {
2563           case NEIGHBOR_NOT_CHAR:
2564             continue;
2565           case NEIGHBOR_FOUND:
             /* incremented without overflow: done */
2566             return str;
2567           case NEIGHBOR_WRAPPED:
2568             last_alnum = s;
2569             break;
2570         }
         /* remember where the carry would have to be inserted */
2571         c = 1;
2572         carry_pos = s - sbeg;
2573         carry_len = l;
2574     }
2575     if (c == -1) {              /* str contains no alnum */
         /* second pass: increment the rightmost characters as plain
            characters of the encoding */
2576         s = e;
2577         while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
2578             enum neighbor_char neighbor;
2579             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2580             neighbor = enc_succ_char(s, l, enc);
2581             if (neighbor == NEIGHBOR_FOUND)
2582                 return str;
2583             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
2584                 /* wrapped to \0...\0.  search next valid char. */
2585                 enc_succ_char(s, l, enc);
2586             }
             /* for encodings that are not ASCII compatible the default
                one-byte carry is replaced by a copy of this character */
2587             if (!rb_enc_asciicompat(enc)) {
2588                 MEMCPY(carry, s, char, l);
2589                 carry_len = l;
2590             }
2591             carry_pos = s - sbeg;
2592         }
2593     }
     /* every character overflowed: make room and insert the carry in
        front of the leftmost wrapped character */
2594     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
2595     s = RSTRING_PTR(str) + carry_pos;
2596     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
2597     memmove(s, carry, carry_len);
2598     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
2599     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2600     rb_enc_str_coderange(str);
2601     return str;
2602 }
2603
2604
2605 /*
2606  *  call-seq:
2607  *     str.succ!   => str
2608  *     str.next!   => str
2609  *
2610  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
2611  *  place.
2612  */
2613
2614 static VALUE
2615 rb_str_succ_bang(VALUE str)
2616 {
2617     rb_str_shared_replace(str, rb_str_succ(str));
2618
2619     return str;
2620 }
2621
2622
2623 /*
2624  *  call-seq:
2625  *     str.upto(other_str, exclusive=false) {|s| block }   => str
2626  *
2627  *  Iterates through successive values, starting at <i>str</i> and
2628  *  ending at <i>other_str</i> inclusive, passing each value in turn to
2629  *  the block. The <code>String#succ</code> method is used to generate
2630  *  each value.  If optional second argument exclusive is omitted or is <code>false</code>,
2631  *  the last value will be included; otherwise it will be excluded.
2632  *
2633  *     "a8".upto("b6") {|s| print s, ' ' }
2634  *     for s in "a8".."b6"
2635  *       print s, ' '
2636  *     end
2637  *
2638  *  <em>produces:</em>
2639  *
2640  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2641  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2642  */
2643
2644 static VALUE
2645 rb_str_upto(int argc, VALUE *argv, VALUE beg)
2646 {
2647     VALUE end, exclusive;
2648     VALUE current, after_end;
2649     ID succ;
2650     int n, excl;
2651     rb_encoding *enc;
2652
2653     rb_scan_args(argc, argv, "11", &end, &exclusive);
2654     excl = RTEST(exclusive);
2655     CONST_ID(succ, "succ");
2656     StringValue(end);
2657     enc = rb_enc_check(beg, end);
2658     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 &&
2659         is_ascii_string(beg) && is_ascii_string(end)) {
2660         char c = RSTRING_PTR(beg)[0];
2661         char e = RSTRING_PTR(end)[0];
2662
2663         if (c > e || (excl && c == e)) return beg;
2664         for (;;) {
2665             rb_yield(rb_enc_str_new(&c, 1, enc));
2666             if (!excl && c == e) break;
2667             c++;
2668             if (excl && c == e) break;
2669         }
2670         return beg;
2671     }
2672     n = rb_str_cmp(beg, end);
2673     if (n > 0 || (excl && n == 0)) return beg;
2674
2675     after_end = rb_funcall(end, succ, 0, 0);
2676     current = beg;
2677     while (!rb_str_equal(current, after_end)) {
2678         rb_yield(current);
2679         if (!excl && rb_str_equal(current, end)) break;
2680         current = rb_funcall(current, succ, 0, 0);
2681         StringValue(current);
2682         if (excl && rb_str_equal(current, end)) break;
2683         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
2684             break;
2685     }
2686
2687     return beg;
2688 }
2689
2690 static VALUE
2691 rb_str_subpat(VALUE str, VALUE re, int nth)
2692 {
2693     if (rb_reg_search(re, str, 0, 0) >= 0) {
2694         return rb_reg_nth_match(nth, rb_backref_get());
2695     }
2696     return Qnil;
2697 }
2698
2699 static VALUE
2700 rb_str_aref(VALUE str, VALUE indx)
2701 {
2702     long idx;
2703
2704     switch (TYPE(indx)) {
2705       case T_FIXNUM:
2706         idx = FIX2LONG(indx);
2707
2708       num_index:
2709         str = rb_str_substr(str, idx, 1);
2710         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
2711         return str;
2712
2713       case T_REGEXP:
2714         return rb_str_subpat(str, indx, 0);
2715
2716       case T_STRING:
2717         if (rb_str_index(str, indx, 0) != -1)
2718             return rb_str_dup(indx);
2719         return Qnil;
2720
2721       default:
2722         /* check if indx is Range */
2723         {
2724             long beg, len;
2725             VALUE tmp;
2726
2727             len = str_strlen(str, STR_ENC_GET(str));
2728             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
2729               case Qfalse:
2730                 break;
2731               case Qnil:
2732                 return Qnil;
2733               default:
2734                 tmp = rb_str_substr(str, beg, len);
2735                 return tmp;
2736             }
2737         }
2738         idx = NUM2LONG(indx);
2739         goto num_index;
2740     }
2741     return Qnil;                /* not reached */
2742 }
2743
2744
2745 /*
2746  *  call-seq:
2747  *     str[fixnum]                 => new_str or nil
2748  *     str[fixnum, fixnum]         => new_str or nil
2749  *     str[range]                  => new_str or nil
2750  *     str[regexp]                 => new_str or nil
2751  *     str[regexp, fixnum]         => new_str or nil
2752  *     str[other_str]              => new_str or nil
2753  *     str.slice(fixnum)           => new_str or nil
2754  *     str.slice(fixnum, fixnum)   => new_str or nil
2755  *     str.slice(range)            => new_str or nil
2756  *     str.slice(regexp)           => new_str or nil
2757  *     str.slice(regexp, fixnum)   => new_str or nil
2758  *     str.slice(other_str)        => new_str or nil
2759  *
2760  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
2761  *  substring of one character at that position. If passed two <code>Fixnum</code>
2762  *  objects, returns a substring starting at the offset given by the first, and
2763  *  a length given by the second. If given a range, a substring containing
2764  *  characters at offsets given by the range is returned. In all three cases, if
2765  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
2766  *  <code>nil</code> if the initial offset falls outside the string, the length
2767  *  is negative, or the beginning of the range is greater than the end.
2768  *
2769  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
2770  *  returned. If a numeric parameter follows the regular expression, that
2771  *  component of the <code>MatchData</code> is returned instead. If a
2772  *  <code>String</code> is given, that string is returned if it occurs in
2773  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
2774  *  match.
2775  *
2776  *     a = "hello there"
2777  *     a[1]                   #=> "e"
2778  *     a[1,3]                 #=> "ell"
2779  *     a[1..3]                #=> "ell"
2780  *     a[-3,2]                #=> "er"
2781  *     a[-4..-2]              #=> "her"
2782  *     a[12..-1]              #=> nil
2783  *     a[-2..-4]              #=> ""
2784  *     a[/[aeiou](.)\1/]      #=> "ell"
2785  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
2786  *     a[/[aeiou](.)\1/, 1]   #=> "l"
2787  *     a[/[aeiou](.)\1/, 2]   #=> nil
2788  *     a["lo"]                #=> "lo"
2789  *     a["bye"]               #=> nil
2790  */
2791
2792 static VALUE
2793 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
2794 {
2795     if (argc == 2) {
2796         if (TYPE(argv[0]) == T_REGEXP) {
2797             return rb_str_subpat(str, argv[0], NUM2INT(argv[1]));
2798         }
2799         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
2800     }
2801     if (argc != 1) {
2802         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2803     }
2804     return rb_str_aref(str, argv[0]);
2805 }
2806
     /*
      * Replace +len+ bytes of +str+ starting at byte offset +beg+ with the
      * contents of +val+, growing or shrinking +str+ as needed.  Works on
      * byte positions only; callers in this file translate character
      * positions first.  Taint is propagated from +val+ to +str+.
      */
2807 static void
2808 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
2809 {
2810     rb_str_modify(str);
2811     if (len < RSTRING_LEN(val)) {
2812         /* expand string */
2813         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
2814     }
2815
     /* replacement differs in size: slide the tail that follows the
        replaced span to its new place (the regions may overlap) */
2816     if (RSTRING_LEN(val) != len) {
2817         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
2818                 RSTRING_PTR(str) + beg + len,
2819                 RSTRING_LEN(str) - (beg + len));
2820     }
     /* NOTE(review): only taken for a negative len; the callers visible
        in this part of the file never pass one -- confirm whether this
        branch is still reachable */
2821     if (RSTRING_LEN(val) < beg && len < 0) {
2822         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
2823     }
2824     if (RSTRING_LEN(val) > 0) {
2825         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
2826     }
2827     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
     /* keep the buffer NUL-terminated when there is a buffer at all */
2828     if (RSTRING_PTR(str)) {
2829         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2830     }
2831     OBJ_INFECT(str, val);
2832 }
2833
2834 static void
2835 rb_str_splice(VALUE str, long beg, long len, VALUE val)
2836 {
2837     long slen;
2838     char *p, *e;
2839     rb_encoding *enc;
2840     int singlebyte = single_byte_optimizable(str);
2841
2842     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
2843
2844     StringValue(val);
2845     rb_str_modify(str);
2846     enc = rb_enc_check(str, val);
2847     slen = str_strlen(str, enc);
2848
2849     if (slen < beg) {
2850       out_of_range:
2851         rb_raise(rb_eIndexError, "index %ld out of string", beg);
2852     }
2853     if (beg < 0) {
2854         if (-beg > slen) {
2855             goto out_of_range;
2856         }
2857         beg += slen;
2858     }
2859     if (slen < len || slen < beg + len) {
2860         len = slen - beg;
2861     }
2862     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
2863     if (!p) p = RSTRING_END(str);
2864     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
2865     if (!e) e = RSTRING_END(str);
2866     /* error check */
2867     beg = p - RSTRING_PTR(str); /* physical position */
2868     len = e - p;                /* physical length */
2869     rb_str_splice_0(str, beg, len, val);
2870     rb_enc_associate(str, enc);
2871 }
2872
     /*
      * Non-static entry point that forwards unchanged to rb_str_splice():
      * replaces +len+ characters of +str+ starting at character index
      * +beg+ with +val+.
      */
2873 void
2874 rb_str_update(VALUE str, long beg, long len, VALUE val)
2875 {
2876     rb_str_splice(str, beg, len, val);
2877 }
2878
2879 static void
2880 rb_str_subpat_set(VALUE str, VALUE re, int nth, VALUE val)
2881 {
2882     VALUE match;
2883     long start, end, len;
2884     rb_encoding *enc;
2885     struct re_registers *regs;
2886
2887     if (rb_reg_search(re, str, 0, 0) < 0) {
2888         rb_raise(rb_eIndexError, "regexp not matched");
2889     }
2890     match = rb_backref_get();
2891     regs = RMATCH_REGS(match);
2892     if (nth >= regs->num_regs) {
2893       out_of_range:
2894         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
2895     }
2896     if (nth < 0) {
2897         if (-nth >= regs->num_regs) {
2898             goto out_of_range;
2899         }
2900         nth += regs->num_regs;
2901     }
2902
2903     start = BEG(nth);
2904     if (start == -1) {
2905         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
2906     }
2907     end = END(nth);
2908     len = end - start;
2909     StringValue(val);
2910     enc = rb_enc_check(str, val);
2911     rb_str_splice_0(str, start, len, val);
2912     rb_enc_associate(str, enc);
2913 }
2914
2915 static VALUE
2916 rb_str_aset(VALUE str, VALUE indx, VALUE val)
2917 {
2918     long idx, beg;
2919
2920     switch (TYPE(indx)) {
2921       case T_FIXNUM:
2922         idx = FIX2LONG(indx);
2923       num_index:
2924         rb_str_splice(str, idx, 1, val);
2925         return val;
2926
2927       case T_REGEXP:
2928         rb_str_subpat_set(str, indx, 0, val);
2929         return val;
2930
2931       case T_STRING:
2932         beg = rb_str_index(str, indx, 0);
2933         if (beg < 0) {
2934             rb_raise(rb_eIndexError, "string not matched");
2935         }
2936         beg = rb_str_sublen(str, beg);
2937         rb_str_splice(str, beg, str_strlen(indx, 0), val);
2938         return val;
2939
2940       default:
2941         /* check if indx is Range */
2942         {
2943             long beg, len;
2944             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
2945                 rb_str_splice(str, beg, len, val);
2946                 return val;
2947             }
2948         }
2949         idx = NUM2LONG(indx);
2950         goto num_index;
2951     }
2952 }
2953
2954 /*
2955  *  call-seq:
2956  *     str[fixnum] = new_str
2957  *     str[fixnum, fixnum] = new_str
2958  *     str[range] = aString
2959  *     str[regexp] = new_str
2960  *     str[regexp, fixnum] = new_str
2961  *     str[other_str] = new_str
2962  *
2963  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
2964  *  portion of the string affected is determined using the same criteria as
2965  *  <code>String#[]</code>. If the replacement string is not the same length as
2966  *  the text it is replacing, the string will be adjusted accordingly. If the
2967  *  regular expression or string is used as the index doesn't match a position
2968  *  in the string, <code>IndexError</code> is raised. If the regular expression
2969  *  form is used, the optional second <code>Fixnum</code> allows you to specify
2970  *  which portion of the match to replace (effectively using the
2971  *  <code>MatchData</code> indexing rules. The forms that take a
2972  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
2973  *  out of range; the <code>Range</code> form will raise a
2974  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
2975  *  forms will silently ignore the assignment.
2976  */
2977
2978 static VALUE
2979 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
2980 {
2981     if (argc == 3) {
2982         if (TYPE(argv[0]) == T_REGEXP) {
2983             rb_str_subpat_set(str, argv[0], NUM2INT(argv[1]), argv[2]);
2984         }
2985         else {
2986             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
2987         }
2988         return argv[2];
2989     }
2990     if (argc != 2) {
2991         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
2992     }
2993     return rb_str_aset(str, argv[0], argv[1]);
2994 }
2995
2996 /*
2997  *  call-seq:
2998  *     str.insert(index, other_str)   => str
2999  *
3000  *  Inserts <i>other_str</i> before the character at the given
3001  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
3002  *  end of the string, and insert <em>after</em> the given character.
3003  *  The intent is insert <i>aString</i> so that it starts at the given
3004  *  <i>index</i>.
3005  *
3006  *     "abcd".insert(0, 'X')    #=> "Xabcd"
3007  *     "abcd".insert(3, 'X')    #=> "abcXd"
3008  *     "abcd".insert(4, 'X')    #=> "abcdX"
3009  *     "abcd".insert(-3, 'X')   #=> "abXcd"
3010  *     "abcd".insert(-1, 'X')   #=> "abcdX"
3011  */
3012
3013 static VALUE
3014 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
3015 {
3016     long pos = NUM2LONG(idx);
3017
3018     if (pos == -1) {
3019         return rb_str_append(str, str2);
3020     }
3021     else if (pos < 0) {
3022         pos++;
3023     }
3024     rb_str_splice(str, pos, 0, str2);
3025     return str;
3026 }
3027
3028
3029 /*
3030  *  call-seq:
3031  *     str.slice!(fixnum)           => fixnum or nil
3032  *     str.slice!(fixnum, fixnum)   => new_str or nil
3033  *     str.slice!(range)            => new_str or nil
3034  *     str.slice!(regexp)           => new_str or nil
3035  *     str.slice!(other_str)        => new_str or nil
3036  *
3037  *  Deletes the specified portion from <i>str</i>, and returns the portion
3038  *  deleted.
3039  *
3040  *     string = "this is a string"
3041  *     string.slice!(2)        #=> "i"
3042  *     string.slice!(3..6)     #=> " is "
3043  *     string.slice!(/s.*t/)   #=> "sa st"
3044  *     string.slice!("r")      #=> "r"
3045  *     string                  #=> "thing"
3046  */
3047
3048 static VALUE
3049 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
3050 {
3051     VALUE result;
3052     VALUE buf[3];
3053     int i;
3054
3055     if (argc < 1 || 2 < argc) {
3056         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
3057     }
3058     for (i=0; i<argc; i++) {
3059         buf[i] = argv[i];
3060     }
3061     rb_str_modify(str);
3062     buf[i] = rb_str_new(0,0);
3063     result = rb_str_aref_m(argc, buf, str);
3064     if (!NIL_P(result)) {
3065         rb_str_aset_m(argc+1, buf, str);
3066     }
3067     return result;
3068 }
3069
3070 static VALUE
3071 get_pat(VALUE pat, int quote)
3072 {
3073     VALUE val;
3074
3075     switch (TYPE(pat)) {
3076       case T_REGEXP:
3077         return pat;
3078
3079       case T_STRING:
3080         break;
3081
3082       default:
3083         val = rb_check_string_type(pat);
3084         if (NIL_P(val)) {
3085             Check_Type(pat, T_REGEXP);
3086         }
3087         pat = val;
3088     }
3089
3090     if (quote) {
3091         pat = rb_reg_quote(pat);
3092     }
3093
3094     return rb_reg_regcomp(pat);
3095 }
3096
3097
3098 /*
3099  *  call-seq:
3100  *     str.sub!(pattern, replacement)          => str or nil
3101  *     str.sub!(pattern) {|match| block }      => str or nil
3102  *
3103  *  Performs the substitutions of <code>String#sub</code> in place,
3104  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
3105  *  performed.
3106  */
3107
3108 static VALUE
3109 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
3110 {
3111     VALUE pat, repl, hash = Qnil;
3112     int iter = 0;
3113     int tainted = 0;
3114     long plen;
3115
3116     if (argc == 1 && rb_block_given_p()) {
3117         iter = 1;
3118     }
3119     else if (argc == 2) {
3120         repl = argv[1];
3121         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3122         if (NIL_P(hash)) {
3123             StringValue(repl);
3124         }
3125         if (OBJ_TAINTED(repl)) tainted = 1;
3126     }
3127     else {
3128         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3129     }
3130
3131     pat = get_pat(argv[0], 1);
3132     if (rb_reg_search(pat, str, 0, 0) >= 0) {
3133         rb_encoding *enc;
3134         int cr = ENC_CODERANGE(str);
3135         VALUE match = rb_backref_get();
3136         struct re_registers *regs = RMATCH_REGS(match);
3137         long beg0 = BEG(0);
3138         long end0 = END(0);
3139
3140         if (iter || !NIL_P(hash)) {
3141             char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
3142
3143             if (iter) {
3144                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3145             }
3146             else {
3147                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
3148                 repl = rb_obj_as_string(repl);
3149             }
3150             str_mod_check(str, p, len);
3151             str_frozen_check(str);
3152         }
3153         else {
3154             repl = rb_reg_regsub(repl, str, regs, pat);
3155         }
3156         enc = rb_enc_compatible(str, repl);
3157         if (!enc) {
3158             rb_encoding *str_enc = STR_ENC_GET(str);
3159             if (coderange_scan(RSTRING_PTR(str), beg0, str_enc) != ENC_CODERANGE_7BIT ||
3160                 coderange_scan(RSTRING_PTR(str)+end0,
3161                                RSTRING_LEN(str)-end0, str_enc) != ENC_CODERANGE_7BIT) {
3162                 rb_raise(rb_eArgError, "character encodings differ: %s and %s",
3163                          rb_enc_name(str_enc),
3164                          rb_enc_name(STR_ENC_GET(repl)));
3165             }
3166             enc = STR_ENC_GET(repl);
3167         }
3168         rb_str_modify(str);
3169         rb_enc_associate(str, enc);
3170         if (OBJ_TAINTED(repl)) tainted = 1;
3171         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
3172             int cr2 = ENC_CODERANGE(repl);
3173             if (cr2 == ENC_CODERANGE_UNKNOWN || cr2 > cr) cr = cr2;
3174         }
3175         plen = end0 - beg0;
3176         if (RSTRING_LEN(repl) > plen) {
3177             RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
3178         }
3179         if (RSTRING_LEN(repl) != plen) {
3180             memmove(RSTRING_PTR(str) + beg0 + RSTRING_LEN(repl),
3181                     RSTRING_PTR(str) + beg0 + plen,
3182                     RSTRING_LEN(str) - beg0 - plen);
3183         }
3184         memcpy(RSTRING_PTR(str) + beg0,
3185                RSTRING_PTR(repl), RSTRING_LEN(repl));
3186         STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
3187         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3188         ENC_CODERANGE_SET(str, cr);
3189         if (tainted) OBJ_TAINT(str);
3190
3191         return str;
3192     }
3193     return Qnil;
3194 }
3195
3196
3197 /*
3198  *  call-seq:
3199  *     str.sub(pattern, replacement)         => new_str
3200  *     str.sub(pattern) {|match| block }     => new_str
3201  *
3202  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
3203  *  <i>pattern</i> replaced with either <i>replacement</i> or the value of the
3204  *  block. The <i>pattern</i> will typically be a <code>Regexp</code>; if it is
3205  *  a <code>String</code> then no regular expression metacharacters will be
3206  *  interpreted (that is <code>/\d/</code> will match a digit, but
3207  *  <code>'\d'</code> will match a backslash followed by a 'd').
3208  *
3209  *  If the method call specifies <i>replacement</i>, special variables such as
3210  *  <code>$&</code> will not be useful, as substitution into the string occurs
3211  *  before the pattern match starts. However, the sequences <code>\1</code>,
3212  *  <code>\2</code>, <code>\k<group_name></code>, etc., may be used.
3213  *
3214  *  In the block form, the current match string is passed in as a parameter, and
3215  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3216  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3217  *  returned by the block will be substituted for the match on each call.
3218  *
3219  *  The result inherits any tainting in the original string or any supplied
3220  *  replacement string.
3221  *
3222  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
3223  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
3224  *     "hello".sub(/./) {|s| s[0].ord.to_s + ' ' }  #=> "104 ello"
3225  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
3226  */
3227
3228 static VALUE
3229 rb_str_sub(int argc, VALUE *argv, VALUE str)
3230 {
3231     str = rb_str_dup(str);
3232     rb_str_sub_bang(argc, argv, str);
3233     return str;
3234 }
3235
3236 static VALUE
3237 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
3238 {
3239     VALUE pat, val, repl, match, dest, hash = Qnil;
3240     struct re_registers *regs;
3241     long beg, n;
3242     long beg0, end0;
3243     long offset, blen, slen, len, last;
3244     int iter = 0;
3245     char *sp, *cp;
3246     int tainted = 0;
3247     rb_encoding *str_enc;
3248
3249     switch (argc) {
3250       case 1:
3251         RETURN_ENUMERATOR(str, argc, argv);
3252         iter = 1;
3253         break;
3254       case 2:
3255         repl = argv[1];
3256         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3257         if (NIL_P(hash)) {
3258             StringValue(repl);
3259         }
3260         if (OBJ_TAINTED(repl)) tainted = 1;
3261         break;
3262       default:
3263         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3264     }
3265
3266     pat = get_pat(argv[0], 1);
3267     beg = rb_reg_search(pat, str, 0, 0);
3268     if (beg < 0) {
3269         if (bang) return Qnil;  /* no match, no substitution */
3270         return rb_str_dup(str);
3271     }
3272
3273     offset = 0;
3274     n = 0;
3275     blen = RSTRING_LEN(str) + 30; /* len + margin */
3276     dest = rb_str_buf_new(blen);
3277     sp = RSTRING_PTR(str);
3278     slen = RSTRING_LEN(str);
3279     cp = sp;
3280     str_enc = STR_ENC_GET(str);
3281
3282     do {
3283         n++;
3284         match = rb_backref_get();
3285         regs = RMATCH_REGS(match);
3286         beg0 = BEG(0);
3287         end0 = END(0);
3288         if (iter || !NIL_P(hash)) {
3289             if (iter) {
3290                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3291             }
3292             else {
3293                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3294                 val = rb_obj_as_string(val);
3295             }
3296             str_mod_check(str, sp, slen);
3297             if (bang) str_frozen_check(str);
3298             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
3299                 rb_raise(rb_eRuntimeError, "block should not cheat");
3300             }
3301         }
3302         else {
3303             val = rb_reg_regsub(repl, str, regs, pat);
3304         }
3305
3306         if (OBJ_TAINTED(val)) tainted = 1;
3307
3308         len = beg - offset;     /* copy pre-match substr */
3309         if (len) {
3310             rb_enc_str_buf_cat(dest, cp, len, str_enc);
3311         }
3312
3313         rb_str_buf_append(dest, val);
3314
3315         last = offset;
3316         offset = end0;
3317         if (beg0 == end0) {
3318             /*
3319              * Always consume at least one character of the input string
3320              * in order to prevent infinite loops.
3321              */
3322             if (RSTRING_LEN(str) <= end0) break;
3323             len = rb_enc_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
3324             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
3325             offset = end0 + len;
3326         }
3327         cp = RSTRING_PTR(str) + offset;
3328         if (offset > RSTRING_LEN(str)) break;
3329         beg = rb_reg_search(pat, str, offset, 0);
3330     } while (beg >= 0);
3331     if (RSTRING_LEN(str) > offset) {
3332         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
3333     }
3334     rb_reg_search(pat, str, last, 0);
3335     if (bang) {
3336         rb_str_shared_replace(str, dest);
3337     }
3338     else {
3339         RBASIC(dest)->klass = rb_obj_class(str);
3340         OBJ_INFECT(dest, str);
3341         str = dest;
3342     }
3343
3344     if (tainted) OBJ_TAINT(str);
3345     return str;
3346 }
3347
3348
3349 /*
3350  *  call-seq:
3351  *     str.gsub!(pattern, replacement)        => str or nil
3352  *     str.gsub!(pattern) {|match| block }    => str or nil
3353  *
3354  *  Performs the substitutions of <code>String#gsub</code> in place, returning
3355  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
3356  */
3357
3358 static VALUE
3359 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
3360 {
3361     return str_gsub(argc, argv, str, 1);
3362 }
3363
3364
3365 /*
3366  *  call-seq:
3367  *     str.gsub(pattern, replacement)       => new_str
3368  *     str.gsub(pattern) {|match| block }   => new_str
3369  *
3370  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of <i>pattern</i>
3371  *  replaced with either <i>replacement</i> or the value of the block. The
3372  *  <i>pattern</i> will typically be a <code>Regexp</code>; if it is a
3373  *  <code>String</code> then no regular expression metacharacters will be
3374  *  interpreted (that is <code>/\d/</code> will match a digit, but
3375  *  <code>'\d'</code> will match a backslash followed by a 'd').
3376  *
3377  *  If a string is used as the replacement, special variables from the match
3378  *  (such as <code>$&</code> and <code>$1</code>) cannot be substituted into it,
3379  *  as substitution into the string occurs before the pattern match
3380  *  starts. However, the sequences <code>\1</code>, <code>\2</code>,
3381  *  <code>\k<group_name></code>, and so on may be used to interpolate
3382  *  successive groups in the match.
3383  *
3384  *  In the block form, the current match string is passed in as a parameter, and
3385  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3386  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3387  *  returned by the block will be substituted for the match on each call.
3388  *
3389  *  The result inherits any tainting in the original string or any supplied
3390  *  replacement string.
3391  *
3392  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
3393  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
3394  *     "hello".gsub(/./) {|s| s[0].ord.to_s + ' '}   #=> "104 101 108 108 111 "
3395  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
3396  */
3397
3398 static VALUE
3399 rb_str_gsub(int argc, VALUE *argv, VALUE str)
3400 {
3401     return str_gsub(argc, argv, str, 0);
3402 }
3403
3404
3405 /*
3406  *  call-seq:
3407  *     str.replace(other_str)   => str
3408  *
3409  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
3410  *  values in <i>other_str</i>.
3411  *
3412  *     s = "hello"         #=> "hello"
3413  *     s.replace "world"   #=> "world"
3414  */
3415
3416 static VALUE
3417 rb_str_replace(VALUE str, VALUE str2)
3418 {
3419     long len;
3420     if (str == str2) return str;
3421
3422     StringValue(str2);
3423     len = RSTRING_LEN(str2);
3424     if (STR_ASSOC_P(str2)) {
3425         str2 = rb_str_new4(str2);
3426     }
3427     if (str_independent(str) && !STR_EMBED_P(str)) {
3428         xfree(RSTRING_PTR(str));
3429     }
3430     if (STR_SHARED_P(str2)) {
3431         STR_SET_NOEMBED(str);
3432         RSTRING(str)->as.heap.len = len;
3433         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
3434         FL_SET(str, ELTS_SHARED);
3435         FL_UNSET(str, STR_ASSOC);
3436         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
3437     }
3438     else {
3439         str_replace_shared(str, rb_str_new4(str2));
3440     }
3441
3442     OBJ_INFECT(str, str2);
3443     rb_enc_cr_str_exact_copy(str, str2);
3444     return str;
3445 }
3446
3447 /*
3448  *  call-seq:
3449  *     string.clear    ->  string
3450  *
3451  *  Makes string empty.
3452  *
3453  *     a = "abcde"
3454  *     a.clear    #=> ""
3455  */
3456
3457 static VALUE
3458 rb_str_clear(VALUE str)
3459 {
3460     /* rb_str_modify() */       /* no need for str_make_independent */
3461     if (str_independent(str) && !STR_EMBED_P(str)) {
3462         xfree(RSTRING_PTR(str));
3463     }
3464     STR_SET_EMBED(str);
3465     STR_SET_EMBED_LEN(str, 0);
3466     RSTRING_PTR(str)[0] = 0;
3467     ENC_CODERANGE_CLEAR(str);
3468     return str;
3469 }
3470
3471 /*
3472  *  call-seq:
3473  *     string.chr    ->  string
3474  *
3475  *  Returns a one-character string at the beginning of the string.
3476  *
3477  *     a = "abcde"
3478  *     a.chr    #=> "a"
3479  */
3480
3481 static VALUE
3482 rb_str_chr(VALUE str)
3483 {
3484     return rb_str_substr(str, 0, 1);
3485 }
3486
3487 /*
3488  *  call-seq:
3489  *     str.getbyte(index)          => 0 .. 255
3490  *
3491  *  returns the <i>index</i>th byte as an integer.
3492  */
3493 static VALUE
3494 rb_str_getbyte(VALUE str, VALUE index)
3495 {
3496     long pos = NUM2LONG(index);
3497
3498     if (pos < 0)
3499         pos += RSTRING_LEN(str);
3500     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
3501         return Qnil;
3502
3503     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
3504 }
3505
3506 /*
3507  *  call-seq:
3508  *     str.setbyte(index, int) => int
3509  *
3510  *  modifies the <i>index</i>th byte as <i>int</i>.
3511  */
3512 static VALUE
3513 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
3514 {
3515     long pos = NUM2LONG(index);
3516     int byte = NUM2INT(value);
3517
3518     rb_str_modify(str);
3519
3520     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
3521         rb_raise(rb_eIndexError, "index %ld out of string", pos);
3522     if (pos < 0)
3523         pos += RSTRING_LEN(str);
3524
3525     RSTRING_PTR(str)[pos] = byte;
3526
3527     return value;
3528 }
3529
3530 /*
3531  *  call-seq:
3532  *     str.reverse   => new_str
3533  *
3534  *  Returns a new string with the characters from <i>str</i> in reverse order.
3535  *
3536  *     "stressed".reverse   #=> "desserts"
3537  */
3538
3539 static VALUE
3540 rb_str_reverse(VALUE str)
3541 {
3542     rb_encoding *enc;
3543     VALUE rev;
3544     char *s, *e, *p;
3545     int single = 1;
3546
3547     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
3548     enc = STR_ENC_GET(str);
3549     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
3550     s = RSTRING_PTR(str); e = RSTRING_END(str);
3551     p = RSTRING_END(rev);
3552
3553     if (RSTRING_LEN(str) > 1) {
3554         if (single_byte_optimizable(str)) {
3555             while (s < e) {
3556                 *--p = *s++;
3557             }
3558         }
3559         else {
3560             while (s < e) {
3561                 int clen = rb_enc_mbclen(s, e, enc);
3562
3563                 if (clen > 1 || (*s & 0x80)) single = 0;
3564                 p -= clen;
3565                 memcpy(p, s, clen);
3566                 s += clen;
3567             }
3568         }
3569     }
3570     STR_SET_LEN(rev, RSTRING_LEN(str));
3571     OBJ_INFECT(rev, str);
3572     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
3573         if (single) {
3574             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
3575         }
3576         else {
3577             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
3578         }
3579     }
3580     rb_enc_cr_str_copy_for_substr(rev, str);
3581
3582     return rev;
3583 }
3584
3585
3586 /*
3587  *  call-seq:
3588  *     str.reverse!   => str
3589  *
3590  *  Reverses <i>str</i> in place.
3591  */
3592
3593 static VALUE
3594 rb_str_reverse_bang(VALUE str)
3595 {
3596     if (RSTRING_LEN(str) > 1) {
3597         if (single_byte_optimizable(str)) {
3598             char *s, *e, c;
3599             int cr = ENC_CODERANGE(str);
3600             int single = 1;
3601
3602             rb_str_modify(str);
3603             s = RSTRING_PTR(str);
3604             e = RSTRING_END(str) - 1;
3605             while (s < e) {
3606                 c = *s;
3607                 if (*s & 0x80) single = 0;
3608                 *s++ = *e;
3609                 *e-- = c;
3610             }
3611             if (cr == ENC_CODERANGE_UNKNOWN && single) {
3612                 cr = ENC_CODERANGE_7BIT;
3613             }
3614             ENC_CODERANGE_SET(str, cr);
3615         }
3616         else {
3617             rb_str_shared_replace(str, rb_str_reverse(str));
3618         }
3619     }
3620     return str;
3621 }
3622
3623
3624 /*
3625  *  call-seq:
3626  *     str.include? other_str   => true or false
3627  *
3628  *  Returns <code>true</code> if <i>str</i> contains the given string or
3629  *  character.
3630  *
3631  *     "hello".include? "lo"   #=> true
3632  *     "hello".include? "ol"   #=> false
3633  *     "hello".include? ?h     #=> true
3634  */
3635
3636 static VALUE
3637 rb_str_include(VALUE str, VALUE arg)
3638 {
3639     long i;
3640
3641     StringValue(arg);
3642     i = rb_str_index(str, arg, 0);
3643
3644     if (i == -1) return Qfalse;
3645     return Qtrue;
3646 }
3647
3648
3649 /*
3650  *  call-seq:
3651  *     str.to_i(base=10)   => integer
3652  *
3653  *  Returns the result of interpreting leading characters in <i>str</i> as an
3654  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
3655  *  end of a valid number are ignored. If there is not a valid number at the
3656  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
3657  *  exception.
3658  *
3659  *     "12345".to_i             #=> 12345
3660  *     "99 red balloons".to_i   #=> 99
3661  *     "0a".to_i                #=> 0
3662  *     "0a".to_i(16)            #=> 10
3663  *     "hello".to_i             #=> 0
3664  *     "1100101".to_i(2)        #=> 101
3665  *     "1100101".to_i(8)        #=> 294977
3666  *     "1100101".to_i(10)       #=> 1100101
3667  *     "1100101".to_i(16)       #=> 17826049
3668  */
3669
3670 static VALUE
3671 rb_str_to_i(int argc, VALUE *argv, VALUE str)
3672 {
3673     int base;
3674
3675     if (argc == 0) base = 10;
3676     else {
3677         VALUE b;
3678
3679         rb_scan_args(argc, argv, "01", &b);
3680         base = NUM2INT(b);
3681     }
3682     if (base < 0) {
3683         rb_raise(rb_eArgError, "invalid radix %d", base);
3684     }
3685     return rb_str_to_inum(str, base, Qfalse);
3686 }
3687
3688
3689 /*
3690  *  call-seq:
3691  *     str.to_f   => float
3692  *
3693  *  Returns the result of interpreting leading characters in <i>str</i> as a
3694  *  floating point number. Extraneous characters past the end of a valid number
3695  *  are ignored. If there is not a valid number at the start of <i>str</i>,
3696  *  <code>0.0</code> is returned. This method never raises an exception.
3697  *
3698  *     "123.45e1".to_f        #=> 1234.5
3699  *     "45.67 degrees".to_f   #=> 45.67
3700  *     "thx1138".to_f         #=> 0.0
3701  */
3702
3703 static VALUE
3704 rb_str_to_f(VALUE str)
3705 {
3706     return DOUBLE2NUM(rb_str_to_dbl(str, Qfalse));
3707 }
3708
3709
3710 /*
3711  *  call-seq:
3712  *     str.to_s     => str
3713  *     str.to_str   => str
3714  *
3715  *  Returns the receiver.
3716  */
3717
3718 static VALUE
3719 rb_str_to_s(VALUE str)
3720 {
3721     if (rb_obj_class(str) != rb_cString) {
3722         VALUE dup = str_alloc(rb_cString);
3723         rb_str_replace(dup, str);
3724         return dup;
3725     }
3726     return str;
3727 }
3728
3729 static void
3730 str_cat_char(VALUE str, int c, rb_encoding *enc)
3731 {
3732     char s[16];
3733     int n = rb_enc_codelen(c, enc);
3734
3735     rb_enc_mbcput(c, s, enc);
3736     rb_enc_str_buf_cat(str, s, n, enc);
3737 }
3738
3739 static void
3740 prefix_escape(VALUE str, int c, rb_encoding *enc)
3741 {
3742     str_cat_char(str, '\\', enc);
3743     str_cat_char(str, c, enc);
3744 }
3745
3746 /*
3747  * call-seq:
3748  *   str.inspect   => string
3749  *
3750  * Returns a printable version of _str_, surrounded by quote marks,
3751  * with special characters escaped.
3752  *
3753  *    str = "hello"
3754  *    str[3] = "\b"
3755  *    str.inspect       #=> "\"hel\\bo\""
3756  */
3757
3758 VALUE
3759 rb_str_inspect(VALUE str)
3760 {
3761     rb_encoding *enc = STR_ENC_GET(str);
3762     char *p, *pend;
3763     VALUE result = rb_str_buf_new(0);
3764
3765     if (!rb_enc_asciicompat(enc)) enc = rb_usascii_encoding();
3766     rb_enc_associate(result, enc);
3767     str_cat_char(result, '"', enc);
3768     p = RSTRING_PTR(str); pend = RSTRING_END(str);
3769     while (p < pend) {
3770         int c;
3771         int n;
3772         int cc;
3773
3774         n = rb_enc_precise_mbclen(p, pend, enc);
3775         if (!MBCLEN_CHARFOUND_P(n)) {
3776             p++;
3777             n = 1;
3778             goto escape_codepoint;
3779         }
3780         n = MBCLEN_CHARFOUND_LEN(n);
3781
3782         c = rb_enc_codepoint(p, pend, enc);
3783         n = rb_enc_codelen(c, enc);
3784
3785         p += n;
3786         if (c == '"'|| c == '\\' ||
3787             (c == '#' &&
3788              p < pend &&
3789              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
3790              (cc = rb_enc_codepoint(p,pend,enc),
3791               (cc == '$' || cc == '@' || cc == '{')))) {
3792             prefix_escape(result, c, enc);
3793         }
3794         else if (c == '\n') {
3795             prefix_escape(result, 'n', enc);
3796         }
3797         else if (c == '\r') {
3798             prefix_escape(result, 'r', enc);
3799         }
3800         else if (c == '\t') {
3801             prefix_escape(result, 't', enc);
3802         }
3803         else if (c == '\f') {
3804             prefix_escape(result, 'f', enc);
3805         }
3806         else if (c == '\013') {
3807             prefix_escape(result, 'v', enc);
3808         }
3809         else if (c == '\010') {
3810             prefix_escape(result, 'b', enc);
3811         }
3812         else if (c == '\007') {
3813             prefix_escape(result, 'a', enc);
3814         }
3815         else if (c == 033) {
3816             prefix_escape(result, 'e', enc);
3817         }
3818         else if (rb_enc_isprint(c, enc)) {
3819             rb_enc_str_buf_cat(result, p-n, n, enc);
3820         }
3821         else {
3822             char buf[5];
3823             char *s;
3824             char *q;
3825
3826           escape_codepoint:
3827             for (q = p-n; q < p; q++) {
3828                 s = buf;
3829                 sprintf(buf, "\\x%02X", *q & 0377);
3830                 while (*s) {
3831                     str_cat_char(result, *s++, enc);
3832                 }
3833             }
3834         }
3835     }
3836     str_cat_char(result, '"', enc);
3837
3838     OBJ_INFECT(result, str);
3839     return result;
3840 }
3841
/* Non-zero when p lies before e and the byte at p is '$', '@' or '{'.
 * Note that p is evaluated more than once. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
3843
3844 /*
3845  *  call-seq:
3846  *     str.dump   => new_str
3847  *
3848  *  Produces a version of <i>str</i> with all nonprinting characters replaced by
3849  *  <code>\xNN</code> notation and all special characters escaped.
3850  */
3851
3852 VALUE
3853 rb_str_dump(VALUE str)
3854 {
3855     rb_encoding *enc0 = rb_enc_get(str);
3856     long len;
3857     const char *p, *pend;
3858     char *q, *qend;
3859     VALUE result;
3860
3861     len = 2;                    /* "" */
3862     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3863     while (p < pend) {
3864         unsigned char c = *p++;
3865         switch (c) {
3866           case '"':  case '\\':
3867           case '\n': case '\r':
3868           case '\t': case '\f':
3869           case '\013': case '\010': case '\007': case '\033':
3870             len += 2;
3871             break;
3872
3873           case '#':
3874             len += IS_EVSTR(p, pend) ? 2 : 1;
3875             break;
3876
3877           default:
3878             if (ISPRINT(c)) {
3879                 len++;
3880             }
3881             else {
3882                 len += 4;               /* \xNN */
3883             }
3884             break;
3885         }
3886     }
3887     if (!rb_enc_asciicompat(enc0)) {
3888         len += 19;              /* ".force_encoding('')" */
3889         len += strlen(enc0->name);
3890     }
3891
3892     result = rb_str_new5(str, 0, len);
3893     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3894     q = RSTRING_PTR(result); qend = q + len;
3895
3896     *q++ = '"';
3897     while (p < pend) {
3898         unsigned char c = *p++;
3899
3900         if (c == '"' || c == '\\') {
3901             *q++ = '\\';
3902             *q++ = c;
3903         }
3904         else if (c == '#') {
3905             if (IS_EVSTR(p, pend)) *q++ = '\\';
3906             *q++ = '#';
3907         }
3908         else if (c == '\n') {
3909             *q++ = '\\';
3910             *q++ = 'n';
3911         }
3912         else if (c == '\r') {
3913             *q++ = '\\';
3914             *q++ = 'r';
3915         }
3916         else if (c == '\t') {
3917             *q++ = '\\';
3918             *q++ = 't';
3919         }
3920         else if (c == '\f') {
3921             *q++ = '\\';
3922             *q++ = 'f';
3923         }
3924         else if (c == '\013') {
3925             *q++ = '\\';
3926             *q++ = 'v';
3927         }
3928         else if (c == '\010') {
3929             *q++ = '\\';
3930             *q++ = 'b';
3931         }
3932         else if (c == '\007') {
3933             *q++ = '\\';
3934             *q++ = 'a';
3935         }
3936         else if (c == '\033') {
3937             *q++ = '\\';
3938             *q++ = 'e';
3939         }
3940         else if (ISPRINT(c)) {
3941             *q++ = c;
3942         }
3943         else {
3944             *q++ = '\\';
3945             sprintf(q, "x%02X", c);
3946             q += 3;
3947         }
3948     }
3949     *q++ = '"';
3950     if (!rb_enc_asciicompat(enc0)) {
3951         sprintf(q, ".force_encoding(\"%s\")", enc0->name);
3952         enc0 = rb_ascii8bit_encoding();
3953     }
3954
3955     OBJ_INFECT(result, str);
3956     /* result from dump is ASCII */
3957     rb_enc_associate(result, enc0);
3958     return result;
3959 }
3960
3961
3962 /*
3963  *  call-seq:
3964  *     str.upcase!   => str or nil
3965  *
3966  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
3967  *  were made.
3968  *  Note: case replacement is effective only in ASCII region.
3969  */
3970
3971 static VALUE
3972 rb_str_upcase_bang(VALUE str)
3973 {
3974     rb_encoding *enc;
3975     char *s, *send;
3976     int modify = 0;
3977     int cr = ENC_CODERANGE(str);
3978
3979     rb_str_modify(str);
3980     enc = STR_ENC_GET(str);
3981     s = RSTRING_PTR(str); send = RSTRING_END(str);
3982     while (s < send) {
3983         int c = rb_enc_codepoint(s, send, enc);
3984
3985         if (rb_enc_islower(c, enc)) {
3986             /* assuming toupper returns codepoint with same size */
3987             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
3988             modify = 1;
3989         }
3990         s += rb_enc_codelen(c, enc);
3991     }
3992
3993     ENC_CODERANGE_SET(str, cr);
3994     if (modify) return str;
3995     return Qnil;
3996 }
3997
3998
3999 /*
4000  *  call-seq:
4001  *     str.upcase   => new_str
4002  *
4003  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
4004  *  uppercase counterparts. The operation is locale insensitive---only
4005  *  characters ``a'' to ``z'' are affected.
4006  *  Note: case replacement is effective only in ASCII region.
4007  *
4008  *     "hEllO".upcase   #=> "HELLO"
4009  */
4010
4011 static VALUE
4012 rb_str_upcase(VALUE str)
4013 {
4014     str = rb_str_dup(str);
4015     rb_str_upcase_bang(str);
4016     return str;
4017 }
4018
4019
4020 /*
4021  *  call-seq:
4022  *     str.downcase!   => str or nil
4023  *
4024  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4025  *  changes were made.
4026  *  Note: case replacement is effective only in ASCII region.
4027  */
4028
4029 static VALUE
4030 rb_str_downcase_bang(VALUE str)
4031 {
4032     rb_encoding *enc;
4033     char *s, *send;
4034     int modify = 0;
4035     int cr = ENC_CODERANGE(str);
4036
4037     rb_str_modify(str);
4038     enc = STR_ENC_GET(str);
4039     s = RSTRING_PTR(str); send = RSTRING_END(str);
4040     while (s < send) {
4041         int c = rb_enc_codepoint(s, send, enc);
4042
4043         if (rb_enc_isupper(c, enc)) {
4044             /* assuming toupper returns codepoint with same size */
4045             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4046             modify = 1;
4047         }
4048         s += rb_enc_codelen(c, enc);
4049     }
4050
4051     ENC_CODERANGE_SET(str, cr);
4052     if (modify) return str;
4053     return Qnil;
4054 }
4055
4056
4057 /*
4058  *  call-seq:
4059  *     str.downcase   => new_str
4060  *
4061  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
4062  *  lowercase counterparts. The operation is locale insensitive---only
4063  *  characters ``A'' to ``Z'' are affected.
4064  *  Note: case replacement is effective only in ASCII region.
4065  *
4066  *     "hEllO".downcase   #=> "hello"
4067  */
4068
4069 static VALUE
4070 rb_str_downcase(VALUE str)
4071 {
4072     str = rb_str_dup(str);
4073     rb_str_downcase_bang(str);
4074     return str;
4075 }
4076
4077
4078 /*
4079  *  call-seq:
4080  *     str.capitalize!   => str or nil
4081  *
4082  *  Modifies <i>str</i> by converting the first character to uppercase and the
4083  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
4084  *  Note: case conversion is effective only in ASCII region.
4085  *
4086  *     a = "hello"
4087  *     a.capitalize!   #=> "Hello"
4088  *     a               #=> "Hello"
4089  *     a.capitalize!   #=> nil
4090  */
4091
4092 static VALUE
4093 rb_str_capitalize_bang(VALUE str)
4094 {
4095     rb_encoding *enc;
4096     char *s, *send;
4097     int modify = 0;
4098     int c;
4099     int cr = ENC_CODERANGE(str);
4100
4101     rb_str_modify(str);
4102     enc = STR_ENC_GET(str);
4103     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4104     s = RSTRING_PTR(str); send = RSTRING_END(str);
4105
4106     c = rb_enc_codepoint(s, send, enc);
4107     if (rb_enc_islower(c, enc)) {
4108         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4109         modify = 1;
4110     }
4111     s += rb_enc_codelen(c, enc);
4112     while (s < send) {
4113         c = rb_enc_codepoint(s, send, enc);
4114         if (rb_enc_isupper(c, enc)) {
4115             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4116             modify = 1;
4117         }
4118         s += rb_enc_codelen(c, enc);
4119     }
4120
4121     ENC_CODERANGE_SET(str, cr);
4122     if (modify) return str;
4123     return Qnil;
4124 }
4125
4126
4127 /*
4128  *  call-seq:
4129  *     str.capitalize   => new_str
4130  *
4131  *  Returns a copy of <i>str</i> with the first character converted to uppercase
4132  *  and the remainder to lowercase.
4133  *  Note: case conversion is effective only in ASCII region.
4134  *
4135  *     "hello".capitalize    #=> "Hello"
4136  *     "HELLO".capitalize    #=> "Hello"
4137  *     "123ABC".capitalize   #=> "123abc"
4138  */
4139
4140 static VALUE
4141 rb_str_capitalize(VALUE str)
4142 {
4143     str = rb_str_dup(str);
4144     rb_str_capitalize_bang(str);
4145     return str;
4146 }
4147
4148
4149 /*
4150  *  call-seq:
 *     str.swapcase!   => str or nil
4152  *
4153  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
4154  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
4155  *  Note: case conversion is effective only in ASCII region.
4156  */
4157
4158 static VALUE
4159 rb_str_swapcase_bang(VALUE str)
4160 {
4161     rb_encoding *enc;
4162     char *s, *send;
4163     int modify = 0;
4164     int cr = ENC_CODERANGE(str);
4165
4166     rb_str_modify(str);
4167     enc = STR_ENC_GET(str);
4168     s = RSTRING_PTR(str); send = RSTRING_END(str);
4169     while (s < send) {
4170         int c = rb_enc_codepoint(s, send, enc);
4171
4172         if (rb_enc_isupper(c, enc)) {
4173             /* assuming toupper returns codepoint with same size */
4174             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4175             modify = 1;
4176         }
4177         else if (rb_enc_islower(c, enc)) {
4178             /* assuming toupper returns codepoint with same size */
4179             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4180             modify = 1;
4181         }
4182         s += rb_enc_codelen(c, enc);
4183     }
4184
4185     ENC_CODERANGE_SET(str, cr);
4186     if (modify) return str;
4187     return Qnil;
4188 }
4189
4190
4191 /*
4192  *  call-seq:
4193  *     str.swapcase   => new_str
4194  *
4195  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
4196  *  to lowercase and lowercase characters converted to uppercase.
4197  *  Note: case conversion is effective only in ASCII region.
4198  *
4199  *     "Hello".swapcase          #=> "hELLO"
4200  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
4201  */
4202
4203 static VALUE
4204 rb_str_swapcase(VALUE str)
4205 {
4206     str = rb_str_dup(str);
4207     rb_str_swapcase_bang(str);
4208     return str;
4209 }
4210
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification; advanced by trnext(). */
struct tr {
    int gen;        /* non-zero while a c1-c2 range is being expanded */
    int now;        /* code point most recently produced */
    int max;        /* last code point of the range being expanded */
    char *p;        /* next unread byte of the specification */
    char *pend;     /* one past the last byte of the specification */
};
4217
4218 static int
4219 trnext(struct tr *t, rb_encoding *enc)
4220 {
4221     for (;;) {
4222         if (!t->gen) {
4223             if (t->p == t->pend) return -1;
4224             if (t->p < t->pend - 1 && *t->p == '\\') {
4225                 t->p++;
4226             }
4227             t->now = rb_enc_codepoint(t->p, t->pend, enc);
4228             t->p += rb_enc_codelen(t->now, enc);
4229             if (t->p < t->pend - 1 && *t->p == '-') {
4230                 t->p++;
4231                 if (t->p < t->pend) {
4232                     int c = rb_enc_codepoint(t->p, t->pend, enc);
4233                     t->p += rb_enc_codelen(c, enc);
4234                     if (t->now > c) continue;
4235                     t->gen = 1;
4236                     t->max = c;
4237                 }
4238             }
4239             return t->now;
4240         }
4241         else if (++t->now < t->max) {
4242             return t->now;
4243         }
4244         else {
4245             t->gen = 0;
4246             return t->max;
4247         }
4248     }
4249 }
4250
4251 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
4252
4253 static VALUE
4254 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
4255 {
4256     int trans[256];
4257     rb_encoding *enc, *e1, *e2;
4258     struct tr trsrc, trrepl;
4259     int cflag = 0;
4260     int c, c0, last = 0, modify = 0, i, l;
4261     char *s, *send;
4262     VALUE hash = 0;
4263     int singlebyte = single_byte_optimizable(str);
4264
4265     StringValue(src);
4266     StringValue(repl);
4267     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4268     if (RSTRING_LEN(repl) == 0) {
4269         return rb_str_delete_bang(1, &src, str);
4270     }
4271
4272     e1 = rb_enc_check(str, src);
4273     e2 = rb_enc_check(str, repl);
4274     if (e1 == e2) {
4275         enc = e1;
4276     }
4277     else {
4278         enc = rb_enc_check(src, repl);
4279     }
4280     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
4281     if (RSTRING_LEN(src) > 1 &&
4282         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
4283         trsrc.p + l < trsrc.pend) {
4284         cflag = 1;
4285         trsrc.p += l;
4286     }
4287     trrepl.p = RSTRING_PTR(repl);
4288     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
4289     trsrc.gen = trrepl.gen = 0;
4290     trsrc.now = trrepl.now = 0;
4291     trsrc.max = trrepl.max = 0;
4292
4293     if (cflag) {
4294         for (i=0; i<256; i++) {
4295             trans[i] = 1;
4296         }
4297         while ((c = trnext(&trsrc, enc)) >= 0) {
4298             if (c < 256) {
4299                 trans[c] = -1;
4300             }
4301             else {
4302                 if (!hash) hash = rb_hash_new();
4303                 rb_hash_aset(hash, INT2NUM(c), Qtrue);
4304             }
4305         }
4306         while ((c = trnext(&trrepl, enc)) >= 0)
4307             /* retrieve last replacer */;
4308         last = trrepl.now;
4309         for (i=0; i<256; i++) {
4310             if (trans[i] >= 0) {
4311                 trans[i] = last;
4312             }
4313         }
4314     }
4315     else {
4316         int r;
4317
4318         for (i=0; i<256; i++) {
4319             trans[i] = -1;
4320         }
4321         while ((c = trnext(&trsrc, enc)) >= 0) {
4322             r = trnext(&trrepl, enc);
4323             if (r == -1) r = trrepl.now;
4324             if (c < 256) {
4325                 trans[c] = r;
4326                 if (r > 255) singlebyte = 0;
4327             }
4328             else {
4329                 if (!hash) hash = rb_hash_new();
4330                 rb_hash_aset(hash, INT2NUM(c), INT2NUM(r));
4331             }
4332         }
4333     }
4334
4335     rb_str_modify(str);
4336     s = RSTRING_PTR(str); send = RSTRING_END(str);
4337     if (sflag) {
4338         int clen, tlen, max = RSTRING_LEN(str);
4339         int offset, save = -1;
4340         char *buf = ALLOC_N(char, max), *t = buf;
4341
4342         while (s < send) {
4343             c0 = c = rb_enc_codepoint(s, send, enc);
4344             tlen = clen = rb_enc_codelen(c, enc);
4345
4346             s += clen;
4347             if (c < 256) {
4348                 c = trans[c];
4349             }
4350             else if (hash) {
4351                 VALUE tmp = rb_hash_lookup(hash, INT2NUM(c));
4352                 if (NIL_P(tmp)) {
4353                     if (cflag) c = last;
4354                     else c = -1;
4355                 }
4356                 else if (cflag) c = -1;
4357                 else c = NUM2INT(tmp);
4358             }
4359             else {
4360                 c = -1;
4361             }
4362             if (c >= 0) {
4363                 if (save == c) continue;
4364                 save = c;
4365                 tlen = rb_enc_codelen(c, enc);
4366                 modify = 1;
4367             }
4368             else {
4369                 save = -1;
4370                 c = c0;
4371             }
4372             while (t - buf + tlen >= max) {
4373                 offset = t - buf;
4374                 max *= 2;
4375                 REALLOC_N(buf, char, max);
4376                 t = buf + offset;
4377             }
4378             rb_enc_mbcput(c, t, enc);
4379             t += tlen;
4380         }
4381         *t = '\0';
4382         RSTRING(str)->as.heap.ptr = buf;
4383         RSTRING(str)->as.heap.len = t - buf;
4384         STR_SET_NOEMBED(str);
4385         RSTRING(str)->as.heap.aux.capa = max;
4386     }
4387     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
4388         while (s < send) {
4389             c = (unsigned char)*s;
4390             if (trans[c] >= 0) {
4391                 if (!cflag) {
4392                     c = trans[c];
4393                     *s = c;
4394                     modify = 1;
4395                 }
4396                 else {
4397                     *s = last;
4398                     modify = 1;
4399                 }
4400             }
4401             s++;
4402         }
4403     }
4404     else {
4405         int clen, tlen, max = RSTRING_LEN(str) * 1.2;
4406         int offset;
4407         char *buf = ALLOC_N(char, max), *t = buf;
4408
4409         while (s < send) {
4410             c0 = c = rb_enc_codepoint(s, send, enc);
4411             tlen = clen = rb_enc_codelen(c, enc);
4412
4413             if (c < 256) {
4414                 c = trans[c];
4415             }
4416             else if (hash) {
4417                 VALUE tmp = rb_hash_lookup(hash, INT2NUM(c));
4418                 if (NIL_P(tmp)) {
4419                     if (cflag) c = last;
4420                     else c = -1;
4421                 }
4422                 else if (cflag) c = -1;
4423                 else c = NUM2INT(tmp);
4424             }
4425             else {
4426                 c = -1;
4427             }
4428             if (c >= 0) {
4429                 tlen = rb_enc_codelen(c, enc);
4430                 modify = 1;
4431             }
4432             else {
4433                 modify = 1;
4434                 c = c0;
4435             }
4436             while (t - buf + tlen >= max) {
4437                 offset = t - buf;
4438                 max *= 2;
4439                 REALLOC_N(buf, char, max);
4440                 t = buf + offset;
4441             }
4442             if (s != t) rb_enc_mbcput(c, t, enc);
4443             s += clen;
4444             t += tlen;
4445         }
4446         if (!STR_EMBED_P(str)) {
4447             xfree(RSTRING(str)->as.heap.ptr);
4448         }
4449         *t = '\0';
4450         RSTRING(str)->as.heap.ptr = buf;
4451         RSTRING(str)->as.heap.len = t - buf;
4452         STR_SET_NOEMBED(str);
4453         RSTRING(str)->as.heap.aux.capa = max;
4454     }
4455
4456     if (modify) {
4457         rb_enc_associate(str, enc);
4458         return str;
4459     }
4460     return Qnil;
4461 }
4462
4463
4464 /*
4465  *  call-seq:
4466  *     str.tr!(from_str, to_str)   => str or nil
4467  *
4468  *  Translates <i>str</i> in place, using the same rules as
4469  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
4470  *  changes were made.
4471  */
4472
4473 static VALUE
4474 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
4475 {
4476     return tr_trans(str, src, repl, 0);
4477 }
4478
4479
4480 /*
4481  *  call-seq:
4482  *     str.tr(from_str, to_str)   => new_str
4483  *
4484  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i> replaced
4485  *  by the corresponding characters in <i>to_str</i>. If <i>to_str</i> is
4486  *  shorter than <i>from_str</i>, it is padded with its last character. Both
4487  *  strings may use the c1--c2 notation to denote ranges of characters, and
4488  *  <i>from_str</i> may start with a <code>^</code>, which denotes all
4489  *  characters except those listed.
4490  *
4491  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
4492  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
4493  *     "hello".tr('el', 'ip')      #=> "hippo"
4494  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
4495  */
4496
4497 static VALUE
4498 rb_str_tr(VALUE str, VALUE src, VALUE repl)
4499 {
4500     str = rb_str_dup(str);
4501     tr_trans(str, src, repl, 0);
4502     return str;
4503 }
4504
4505 static void
4506 tr_setup_table(VALUE str, char stable[256], int first,
4507                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
4508 {
4509     char buf[256];
4510     struct tr tr;
4511     int c, l;
4512     VALUE table = 0, ptable = 0;
4513     int i, cflag = 0;
4514
4515     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
4516     tr.gen = tr.now = tr.max = 0;
4517
4518     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
4519         cflag = 1;
4520         tr.p += l;
4521     }
4522     if (first) {
4523         for (i=0; i<256; i++) {
4524             stable[i] = 1;
4525         }
4526     }
4527     for (i=0; i<256; i++) {
4528         buf[i] = cflag;
4529     }
4530
4531     while ((c = trnext(&tr, enc)) >= 0) {
4532         if (c < 256) {
4533             buf[c & 0xff] = !cflag;
4534         }
4535         else {
4536             VALUE key = INT2NUM(c);
4537
4538             if (!table) {
4539                 table = rb_hash_new();
4540                 if (cflag) {
4541                     ptable = *ctablep;
4542                     *ctablep = table;
4543                 }
4544                 else {
4545                     ptable = *tablep;
4546                     *tablep = table;
4547                 }
4548             }
4549             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
4550                 rb_hash_aset(table, key, Qtrue);
4551             }
4552         }
4553     }
4554     for (i=0; i<256; i++) {
4555         stable[i] = stable[i] && buf[i];
4556     }
4557 }
4558
4559
4560 static int
4561 tr_find(int c, char table[256], VALUE del, VALUE nodel)
4562 {
4563     if (c < 256) {
4564         return table[c] ? Qtrue : Qfalse;
4565     }
4566     else {
4567         VALUE v = INT2NUM(c);
4568
4569         if (del && !NIL_P(rb_hash_lookup(del, v))) {
4570             if (!nodel || NIL_P(rb_hash_lookup(nodel, v))) {
4571                 return Qtrue;
4572             }
4573         }
4574         return Qfalse;
4575     }
4576 }
4577
4578 /*
4579  *  call-seq:
4580  *     str.delete!([other_str]+)   => str or nil
4581  *
4582  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
4583  *  <code>nil</code> if <i>str</i> was not modified.
4584  */
4585
4586 static VALUE
4587 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
4588 {
4589     char squeez[256];
4590     rb_encoding *enc = 0;
4591     char *s, *send, *t;
4592     VALUE del = 0, nodel = 0;
4593     int modify = 0;
4594     int i;
4595     int cr;
4596
4597     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4598     cr = ENC_CODERANGE(str);
4599     if (argc < 1) {
4600         rb_raise(rb_eArgError, "wrong number of arguments");
4601     }
4602     for (i=0; i<argc; i++) {
4603         VALUE s = argv[i];
4604
4605         StringValue(s);
4606         enc = rb_enc_check(str, s);
4607         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
4608     }
4609
4610     rb_str_modify(str);
4611     s = t = RSTRING_PTR(str);
4612     if (!s || RSTRING_LEN(str) == 0) return Qnil;
4613     send = RSTRING_END(str);
4614     while (s < send) {
4615         int c = rb_enc_codepoint(s, send, enc);
4616         int clen = rb_enc_codelen(c, enc);
4617
4618         if (tr_find(c, squeez, del, nodel)) {
4619             modify = 1;
4620         }
4621         else {
4622             if (t != s) rb_enc_mbcput(c, t, enc);
4623             t += clen;
4624         }
4625         s += clen;
4626     }
4627     *t = '\0';
4628     STR_SET_LEN(str, t - RSTRING_PTR(str));
4629
4630     ENC_CODERANGE_SET(str, cr);
4631     if (modify) return str;
4632     return Qnil;
4633 }
4634
4635
4636 /*
4637  *  call-seq:
4638  *     str.delete([other_str]+)   => new_str
4639  *
4640  *  Returns a copy of <i>str</i> with all characters in the intersection of its
4641  *  arguments deleted. Uses the same rules for building the set of characters as
4642  *  <code>String#count</code>.
4643  *
4644  *     "hello".delete "l","lo"        #=> "heo"
4645  *     "hello".delete "lo"            #=> "he"
4646  *     "hello".delete "aeiou", "^e"   #=> "hell"
4647  *     "hello".delete "ej-m"          #=> "ho"
4648  */
4649
4650 static VALUE
4651 rb_str_delete(int argc, VALUE *argv, VALUE str)
4652 {
4653     str = rb_str_dup(str);
4654     rb_str_delete_bang(argc, argv, str);
4655     return str;
4656 }
4657
4658
4659 /*
4660  *  call-seq:
4661  *     str.squeeze!([other_str]*)   => str or nil
4662  *
4663  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
4664  *  <code>nil</code> if no changes were made.
4665  */
4666
4667 static VALUE
4668 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
4669 {
4670     char squeez[256];
4671     rb_encoding *enc = 0;
4672     VALUE del = 0, nodel = 0;
4673     char *s, *send, *t;
4674     int save, modify = 0;
4675     int i;
4676
4677     if (argc == 0) {
4678         enc = STR_ENC_GET(str);
4679     }
4680     else {
4681         for (i=0; i<argc; i++) {
4682             VALUE s = argv[i];
4683
4684             StringValue(s);
4685             enc = rb_enc_check(str, s);
4686             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
4687         }
4688     }
4689
4690     rb_str_modify(str);
4691     s = t = RSTRING_PTR(str);
4692     if (!s || RSTRING_LEN(str) == 0) return Qnil;
4693     send = RSTRING_END(str);
4694     save = -1;
4695     while (s < send) {
4696         int c = rb_enc_codepoint(s, send, enc);
4697         int clen = rb_enc_codelen(c, enc);
4698
4699         if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
4700             if (t != s) rb_enc_mbcput(c, t, enc);
4701             save = c;
4702             t += clen;
4703         }
4704         s += clen;
4705     }
4706     *t = '\0';
4707     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
4708         STR_SET_LEN(str, t - RSTRING_PTR(str));
4709         modify = 1;
4710     }
4711
4712     if (modify) return str;
4713     return Qnil;
4714 }
4715
4716
4717 /*
4718  *  call-seq:
4719  *     str.squeeze([other_str]*)    => new_str
4720  *
4721  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
4722  *  procedure described for <code>String#count</code>. Returns a new string
4723  *  where runs of the same character that occur in this set are replaced by a
4724  *  single character. If no arguments are given, all runs of identical
4725  *  characters are replaced by a single character.
4726  *
4727  *     "yellow moon".squeeze                  #=> "yelow mon"
4728  *     "  now   is  the".squeeze(" ")         #=> " now is the"
4729  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
4730  */
4731
4732 static VALUE
4733 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
4734 {
4735     str = rb_str_dup(str);
4736     rb_str_squeeze_bang(argc, argv, str);
4737     return str;
4738 }
4739
4740
4741 /*
4742  *  call-seq:
4743  *     str.tr_s!(from_str, to_str)   => str or nil
4744  *
4745  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
4746  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
4747  */
4748
4749 static VALUE
4750 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
4751 {
4752     return tr_trans(str, src, repl, 1);
4753 }
4754
4755
4756 /*
4757  *  call-seq:
4758  *     str.tr_s(from_str, to_str)   => new_str
4759  *
4760  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
4761  *  then removes duplicate characters in regions that were affected by the
4762  *  translation.
4763  *
4764  *     "hello".tr_s('l', 'r')     #=> "hero"
4765  *     "hello".tr_s('el', '*')    #=> "h*o"
4766  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
4767  */
4768
4769 static VALUE
4770 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
4771 {
4772     str = rb_str_dup(str);
4773     tr_trans(str, src, repl, 1);
4774     return str;
4775 }
4776
4777
4778 /*
4779  *  call-seq:
4780  *     str.count([other_str]+)   => fixnum
4781  *
4782  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
4783  *  intersection of these sets defines the characters to count in
4784  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
4785  *  negated. The sequence c1--c2 means all characters between c1 and c2.
4786  *
4787  *     a = "hello world"
4788  *     a.count "lo"            #=> 5
4789  *     a.count "lo", "o"       #=> 2
4790  *     a.count "hello", "^l"   #=> 4
4791  *     a.count "ej-m"          #=> 4
4792  */
4793
4794 static VALUE
4795 rb_str_count(int argc, VALUE *argv, VALUE str)
4796 {
4797     char table[256];
4798     rb_encoding *enc = 0;
4799     VALUE del = 0, nodel = 0;
4800     char *s, *send;
4801     int i;
4802
4803     if (argc < 1) {
4804         rb_raise(rb_eArgError, "wrong number of arguments");
4805     }
4806     for (i=0; i<argc; i++) {
4807         VALUE s = argv[i];
4808
4809         StringValue(s);
4810         enc = rb_enc_check(str, s);
4811         tr_setup_table(s, table,i==0, &del, &nodel, enc);
4812     }
4813
4814     s = RSTRING_PTR(str);
4815     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
4816     send = RSTRING_END(str);
4817     i = 0;
4818     while (s < send) {
4819         int c = rb_enc_codepoint(s, send, enc);
4820         int clen = rb_enc_codelen(c, enc);
4821
4822         if (tr_find(c, table, del, nodel)) {
4823             i++;
4824         }
4825         s += clen;
4826     }
4827     return INT2NUM(i);
4828 }
4829
4830
4831 /*
4832  *  call-seq:
4833  *     str.split(pattern=$;, [limit])   => anArray
4834  *
4835  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
4836  *  of these substrings.
4837  *
4838  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
4839  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
4840  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
4841  *  of contiguous whitespace characters ignored.
4842  *
4843  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
4844  *  pattern matches. Whenever the pattern matches a zero-length string,
4845  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
4846  *  groups, the respective matches will be returned in the array as well.
4847  *
4848  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
4849  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
4850  *  split on whitespace as if ` ' were specified.
4851  *
4852  *  If the <i>limit</i> parameter is omitted, trailing null fields are
4853  *  suppressed. If <i>limit</i> is a positive number, at most that number of
4854  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
4855  *  string is returned as the only entry in an array). If negative, there is no
4856  *  limit to the number of fields returned, and trailing null fields are not
4857  *  suppressed.
4858  *
4859  *     " now's  the time".split        #=> ["now's", "the", "time"]
4860  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
4861  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
4862  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
4863  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
4864  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
4865  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
4866  *
4867  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
4868  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
4869  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
4870  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
4871  */
4872
4873 static VALUE
4874 rb_str_split_m(int argc, VALUE *argv, VALUE str)
4875 {
4876     rb_encoding *enc;
4877     VALUE spat;
4878     VALUE limit;
4879     int awk_split = Qfalse;
4880     long beg, end, i = 0;
4881     int lim = 0;
4882     VALUE result, tmp;
4883
4884     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
4885         lim = NUM2INT(limit);
4886         if (lim <= 0) limit = Qnil;
4887         else if (lim == 1) {
4888             if (RSTRING_LEN(str) == 0)
4889                 return rb_ary_new2(0);
4890             return rb_ary_new3(1, str);
4891         }
4892         i = 1;
4893     }
4894
4895     enc = STR_ENC_GET(str);
4896     if (NIL_P(spat)) {
4897         if (!NIL_P(rb_fs)) {
4898             spat = rb_fs;
4899             goto fs_set;
4900         }
4901         awk_split = Qtrue;
4902     }
4903     else {
4904       fs_set:
4905         if (TYPE(spat) == T_STRING) {
4906             rb_encoding *enc2 = STR_ENC_GET(spat);
4907
4908             if (rb_enc_mbminlen(enc2) == 1) {
4909                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
4910                     awk_split = Qtrue;
4911                 }
4912             }
4913             else {
4914                 int l;
4915                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
4916                     RSTRING_LEN(spat) == l) {
4917                     awk_split = Qtrue;
4918                 }
4919             }
4920             if (!awk_split) {
4921                 spat = rb_reg_regcomp(rb_reg_quote(spat));
4922             }
4923         }
4924         else {
4925             spat = get_pat(spat, 1);
4926         }
4927     }
4928
4929     result = rb_ary_new();
4930     beg = 0;
4931     if (awk_split) {
4932         char *ptr = RSTRING_PTR(str);
4933         char *eptr = RSTRING_END(str);
4934         char *bptr = ptr;
4935         int skip = 1;
4936         int c;
4937
4938         end = beg;
4939         while (ptr < eptr) {
4940             c = rb_enc_codepoint(ptr, eptr, enc);
4941             ptr += rb_enc_mbclen(ptr, eptr, enc);
4942             if (skip) {
4943                 if (rb_enc_isspace(c, enc)) {
4944                     beg = ptr - bptr;
4945                 }
4946                 else {
4947                     end = ptr - bptr;
4948                     skip = 0;
4949                     if (!NIL_P(limit) && lim <= i) break;
4950                 }
4951             }
4952             else {
4953                 if (rb_enc_isspace(c, enc)) {
4954                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
4955                     skip = 1;
4956                     beg = ptr - bptr;
4957                     if (!NIL_P(limit)) ++i;
4958                 }
4959                 else {
4960                     end = ptr - bptr;
4961                 }
4962             }
4963         }
4964     }
4965     else {
4966         long start = beg;
4967         long idx;
4968         int last_null = 0;
4969         struct re_registers *regs;
4970
4971         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
4972             regs = RMATCH_REGS(rb_backref_get());
4973             if (start == end && BEG(0) == END(0)) {
4974                 if (!RSTRING_PTR(str)) {
4975                     rb_ary_push(result, rb_str_new("", 0));
4976                     break;
4977                 }
4978                 else if (last_null == 1) {
4979                     rb_ary_push(result, rb_str_subseq(str, beg,
4980                                                       rb_enc_mbclen(RSTRING_PTR(str)+beg,
4981                                                                     RSTRING_END(str),
4982                                                                     enc)));
4983                     beg = start;
4984                 }
4985                 else {
4986                     if (RSTRING_PTR(str)+start == RSTRING_END(str))
4987                         start++;
4988                     else
4989                         start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
4990                     last_null = 1;
4991                     continue;
4992                 }
4993             }
4994             else {
4995                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
4996                 beg = start = END(0);
4997             }
4998             last_null = 0;
4999
5000             for (idx=1; idx < regs->num_regs; idx++) {
5001                 if (BEG(idx) == -1) continue;
5002                 if (BEG(idx) == END(idx))
5003                     tmp = rb_str_new5(str, 0, 0);
5004                 else
5005                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
5006                 rb_ary_push(result, tmp);
5007             }
5008             if (!NIL_P(limit) && lim <= ++i) break;
5009         }
5010     }
5011     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
5012         if (RSTRING_LEN(str) == beg)
5013             tmp = rb_str_new5(str, 0, 0);
5014         else
5015             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
5016         rb_ary_push(result, tmp);
5017     }
5018     if (NIL_P(limit) && lim == 0) {
5019         while (RARRAY_LEN(result) > 0 &&
5020                RSTRING_LEN(RARRAY_PTR(result)[RARRAY_LEN(result)-1]) == 0)
5021             rb_ary_pop(result);
5022     }
5023
5024     return result;
5025 }
5026
5027 VALUE
5028 rb_str_split(VALUE str, const char *sep0)
5029 {
5030     VALUE sep;
5031
5032     StringValue(str);
5033     sep = rb_str_new2(sep0);
5034     return rb_str_split_m(1, &sep, str);
5035 }
5036
5037
5038 /*
5039  *  Document-method: lines
5040  *  call-seq:
5041  *     str.lines(separator=$/)   => anEnumerator
5042  *     str.lines(separator=$/) {|substr| block }        => str
5043  *
5044  *  Returns an enumerator that gives each line in the string.  If a block is
5045  *  given, it iterates over each line in the string.
5046  *
5047  *     "foo\nbar\n".lines.to_a   #=> ["foo\n", "bar\n"]
5048  *     "foo\nb ar".lines.sort    #=> ["b ar", "foo\n"]
5049  */
5050
5051 /*
5052  *  Document-method: each_line
5053  *  call-seq:
5054  *     str.each_line(separator=$/) {|substr| block }   => str
5055  *
5056  *  Splits <i>str</i> using the supplied parameter as the record separator
5057  *  (<code>$/</code> by default), passing each substring in turn to the supplied
5058  *  block. If a zero-length record separator is supplied, the string is split
5059  *  into paragraphs delimited by multiple successive newlines.
5060  *
5061  *     print "Example one\n"
 *     "hello\nworld".each_line {|s| p s}
 *     print "Example two\n"
 *     "hello\nworld".each_line('l') {|s| p s}
 *     print "Example three\n"
 *     "hello\n\n\nworld".each_line('') {|s| p s}
5067  *
5068  *  <em>produces:</em>
5069  *
5070  *     Example one
5071  *     "hello\n"
5072  *     "world"
5073  *     Example two
5074  *     "hel"
5075  *     "l"
5076  *     "o\nworl"
5077  *     "d"
5078  *     Example three
5079  *     "hello\n\n\n"
5080  *     "world"
5081  */
5082
5083 static VALUE
5084 rb_str_each_line(int argc, VALUE *argv, VALUE str)
5085 {
5086     rb_encoding *enc;
5087     VALUE rs;
5088     int newline;
5089     char *p, *pend, *s, *ptr;
5090     long len, rslen;
5091     VALUE line;
5092     int n;
5093     VALUE orig = str;
5094
5095     if (argc == 0) {
5096         rs = rb_rs;
5097     }
5098     else {
5099         rb_scan_args(argc, argv, "01", &rs);
5100     }
5101     RETURN_ENUMERATOR(str, argc, argv);
5102     if (NIL_P(rs)) {
5103         rb_yield(str);
5104         return orig;
5105     }
5106     str = rb_str_new4(str);
5107     ptr = p = s = RSTRING_PTR(str);
5108     pend = p + RSTRING_LEN(str);
5109     len = RSTRING_LEN(str);
5110     StringValue(rs);
5111     if (rs == rb_default_rs) {
5112         enc = rb_enc_get(str);
5113         while (p < pend) {
5114             char *p0;
5115
5116             p = memchr(p, '\n', pend - p);
5117             if (!p) break;
5118             p0 = rb_enc_left_char_head(s, p, enc);
5119             if (!rb_enc_is_newline(p0, pend, enc)) {
5120                 p++;
5121                 continue;
5122             }
5123             p = p0 + rb_enc_mbclen(p0, pend, enc);
5124             line = rb_str_new5(str, s, p - s);
5125             OBJ_INFECT(line, str);
5126             rb_enc_cr_str_copy_for_substr(line, str);
5127             rb_yield(line);
5128             str_mod_check(str, ptr, len);
5129             s = p;
5130         }
5131         goto finish;
5132     }
5133
5134     enc = rb_enc_check(str, rs);
5135     rslen = RSTRING_LEN(rs);
5136     if (rslen == 0) {
5137         newline = '\n';
5138     }
5139     else {
5140         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
5141     }
5142
5143     while (p < pend) {
5144         int c = rb_enc_codepoint(p, pend, enc);
5145
5146       again:
5147         n = rb_enc_codelen(c, enc);
5148         if (rslen == 0 && c == newline) {
5149             p += n;
5150             if (p < pend && (c = rb_enc_codepoint(p, pend, enc)) != newline) {
5151                 goto again;
5152             }
5153             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
5154                 p += n;
5155             }
5156             p -= n;
5157         }
5158         if (c == newline &&
5159             (rslen <= 1 || memcmp(RSTRING_PTR(rs), p, rslen) == 0)) {
5160             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
5161             OBJ_INFECT(line, str);
5162             rb_enc_cr_str_copy_for_substr(line, str);
5163             rb_yield(line);
5164             str_mod_check(str, ptr, len);
5165             s = p + (rslen ? rslen : n);
5166         }
5167         p += n;
5168     }
5169
5170   finish:
5171     if (s != pend) {
5172         line = rb_str_new5(str, s, pend - s);
5173         OBJ_INFECT(line, str);
5174         rb_enc_cr_str_copy_for_substr(line, str);
5175         rb_yield(line);
5176     }
5177
5178     return orig;
5179 }
5180
5181
5182 /*
5183  *  Document-method: bytes
5184  *  call-seq:
5185  *     str.bytes   => anEnumerator
5186  *     str.bytes {|fixnum| block }    => str
5187  *
5188  *  Returns an enumerator that gives each byte in the string.  If a block is
5189  *  given, it iterates over each byte in the string.
5190  *
5191  *     "hello".bytes.to_a        #=> [104, 101, 108, 108, 111]
5192  */
5193
5194 /*
5195  *  Document-method: each_byte
5196  *  call-seq:
5197  *     str.each_byte {|fixnum| block }    => str
5198  *
5199  *  Passes each byte in <i>str</i> to the given block.
5200  *
5201  *     "hello".each_byte {|c| print c, ' ' }
5202  *
5203  *  <em>produces:</em>
5204  *
5205  *     104 101 108 108 111
5206  */
5207
5208 static VALUE
5209 rb_str_each_byte(VALUE str)
5210 {
5211     long i;
5212
5213     RETURN_ENUMERATOR(str, 0, 0);
5214     for (i=0; i<RSTRING_LEN(str); i++) {
5215         rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
5216     }
5217     return str;
5218 }
5219
5220
5221 /*
5222  *  Document-method: chars
5223  *  call-seq:
5224  *     str.chars                   => anEnumerator
5225  *     str.chars {|substr| block } => str
5226  *
5227  *  Returns an enumerator that gives each character in the string.
5228  *  If a block is given, it iterates over each character in the string.
5229  *
5230  *     "foo".chars.to_a   #=> ["f","o","o"]
5231  */
5232
5233 /*
5234  *  Document-method: each_char
5235  *  call-seq:
5236  *     str.each_char {|cstr| block }    => str
5237  *
5238  *  Passes each character in <i>str</i> to the given block.
5239  *
5240  *     "hello".each_char {|c| print c, ' ' }
5241  *
5242  *  <em>produces:</em>
5243  *
5244  *     h e l l o
5245  */
5246
5247 static VALUE
5248 rb_str_each_char(VALUE str)
5249 {
5250     int i, len, n;
5251     const char *ptr;
5252     rb_encoding *enc;
5253
5254     RETURN_ENUMERATOR(str, 0, 0);
5255     str = rb_str_new4(str);
5256     ptr = RSTRING_PTR(str);
5257     len = RSTRING_LEN(str);
5258     enc = rb_enc_get(str);
5259     for (i = 0; i < len; i += n) {
5260         n = rb_enc_mbclen(ptr + i, ptr + len, enc);
5261         rb_yield(rb_str_subseq(str, i, n));
5262     }
5263     return str;
5264 }
5265
5266 static long
5267 chopped_length(VALUE str)
5268 {
5269     rb_encoding *enc = STR_ENC_GET(str);
5270     const char *p, *p2, *beg, *end;
5271
5272     beg = RSTRING_PTR(str);
5273     end = beg + RSTRING_LEN(str);
5274     if (beg > end) return 0;
5275     p = rb_enc_prev_char(beg, end, enc);
5276     if (!p) return 0;
5277     if (p > beg && rb_enc_codepoint(p, end, enc) == '\n') {
5278         p2 = rb_enc_prev_char(beg, p, enc);
5279         if (p2 && rb_enc_codepoint(p2, end, enc) == '\r') p = p2;
5280     }
5281     return p - beg;
5282 }
5283
5284 /*
5285  *  call-seq:
5286  *     str.chop!   => str or nil
5287  *
5288  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
5289  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
5290  *  <code>String#chomp!</code>.
5291  */
5292
5293 static VALUE
5294 rb_str_chop_bang(VALUE str)
5295 {
5296     if (RSTRING_LEN(str) > 0) {
5297         long len;
5298         rb_str_modify(str);
5299         len = chopped_length(str);
5300         STR_SET_LEN(str, len);
5301         RSTRING_PTR(str)[len] = '\0';
5302         return str;
5303     }
5304     return Qnil;
5305 }
5306
5307
5308 /*
5309  *  call-seq:
5310  *     str.chop   => new_str
5311  *
5312  *  Returns a new <code>String</code> with the last character removed.  If the
5313  *  string ends with <code>\r\n</code>, both characters are removed. Applying
5314  *  <code>chop</code> to an empty string returns an empty
5315  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
5316  *  the string unchanged if it doesn't end in a record separator.
5317  *
5318  *     "string\r\n".chop   #=> "string"
5319  *     "string\n\r".chop   #=> "string\n"
5320  *     "string\n".chop     #=> "string"
5321  *     "string".chop       #=> "strin"
5322  *     "x".chop.chop       #=> ""
5323  */
5324
5325 static VALUE
5326 rb_str_chop(VALUE str)
5327 {
5328     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
5329     rb_enc_cr_str_copy_for_substr(str2, str);
5330     OBJ_INFECT(str2, str);
5331     return str2;
5332 }
5333
5334
5335 /*
5336  *  call-seq:
5337  *     str.chomp!(separator=$/)   => str or nil
5338  *
5339  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
5340  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
5341  */
5342
5343 static VALUE
5344 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
5345 {
5346     rb_encoding *enc;
5347     VALUE rs;
5348     int newline;
5349     char *p, *pp, *e;
5350     long len, rslen;
5351
5352     len = RSTRING_LEN(str);
5353     if (len == 0) return Qnil;
5354     p = RSTRING_PTR(str);
5355     e = p + len;
5356     if (argc == 0) {
5357         rs = rb_rs;
5358         if (rs == rb_default_rs) {
5359           smart_chomp:
5360             rb_str_modify(str);
5361             enc = rb_enc_get(str);
5362             if (rb_enc_mbminlen(enc) > 1) {
5363                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), enc);
5364                 if (rb_enc_is_newline(pp, e, enc)) {
5365                     e = pp;
5366                 }
5367                 pp = e - rb_enc_mbminlen(enc);
5368                 if (pp >= p) {
5369                     pp = rb_enc_left_char_head(p, pp, enc);
5370                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
5371                         e = pp;
5372                     }
5373                 }
5374                 if (e == RSTRING_END(str)) {
5375                     return Qnil;
5376                 }
5377                 len = e - RSTRING_PTR(str);
5378                 STR_SET_LEN(str, len);
5379             }
5380             else {
5381                 if (RSTRING_PTR(str)[len-1] == '\n') {
5382                     STR_DEC_LEN(str);
5383                     if (RSTRING_LEN(str) > 0 &&
5384                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
5385                         STR_DEC_LEN(str);
5386                     }
5387                 }
5388                 else if (RSTRING_PTR(str)[len-1] == '\r') {
5389                     STR_DEC_LEN(str);
5390                 }
5391                 else {
5392                     return Qnil;
5393                 }
5394             }
5395             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5396             return str;
5397         }
5398     }
5399     else {
5400         rb_scan_args(argc, argv, "01", &rs);
5401     }
5402     if (NIL_P(rs)) return Qnil;
5403     StringValue(rs);
5404     rslen = RSTRING_LEN(rs);
5405     if (rslen == 0) {
5406         while (len>0 && p[len-1] == '\n') {
5407             len--;
5408             if (len>0 && p[len-1] == '\r')
5409                 len--;
5410         }
5411         if (len < RSTRING_LEN(str)) {
5412             rb_str_modify(str);
5413             STR_SET_LEN(str, len);
5414             RSTRING_PTR(str)[len] = '\0';
5415             return str;
5416         }
5417         return Qnil;
5418     }
5419     if (rslen > len) return Qnil;
5420     newline = RSTRING_PTR(rs)[rslen-1];
5421     if (rslen == 1 && newline == '\n')
5422         goto smart_chomp;
5423
5424     enc = rb_enc_check(str, rs);
5425     if (is_broken_string(rs)) {
5426         return Qnil;
5427     }
5428     pp = e - rslen;
5429     if (p[len-1] == newline &&
5430         (rslen <= 1 ||
5431          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
5432         if (rb_enc_left_char_head(p, pp, enc) != pp)
5433             return Qnil;
5434         rb_str_modify(str);
5435         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
5436         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5437         return str;
5438     }
5439     return Qnil;
5440 }
5441
5442
5443 /*
5444  *  call-seq:
5445  *     str.chomp(separator=$/)   => new_str
5446  *
5447  *  Returns a new <code>String</code> with the given record separator removed
5448  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
5449  *  changed from the default Ruby record separator, then <code>chomp</code> also
5450  *  removes carriage return characters (that is it will remove <code>\n</code>,
5451  *  <code>\r</code>, and <code>\r\n</code>).
5452  *
5453  *     "hello".chomp            #=> "hello"
5454  *     "hello\n".chomp          #=> "hello"
5455  *     "hello\r\n".chomp        #=> "hello"
5456  *     "hello\n\r".chomp        #=> "hello\n"
5457  *     "hello\r".chomp          #=> "hello"
5458  *     "hello \n there".chomp   #=> "hello \n there"
5459  *     "hello".chomp("llo")     #=> "he"
5460  */
5461
5462 static VALUE
5463 rb_str_chomp(int argc, VALUE *argv, VALUE str)
5464 {
5465     str = rb_str_dup(str);
5466     rb_str_chomp_bang(argc, argv, str);
5467     return str;
5468 }
5469
5470 /*
5471  *  call-seq:
5472  *     str.lstrip!   => self or nil
5473  *
5474  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
5475  *  change was made. See also <code>String#rstrip!</code> and
5476  *  <code>String#strip!</code>.
5477  *
5478  *     "  hello  ".lstrip   #=> "hello  "
5479  *     "hello".lstrip!      #=> nil
5480  */
5481
5482 static VALUE
5483 rb_str_lstrip_bang(VALUE str)
5484 {
5485     rb_encoding *enc;
5486     char *s, *t, *e;
5487
5488     rb_str_modify(str);
5489     enc = STR_ENC_GET(str);
5490     s = RSTRING_PTR(str);
5491     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5492     e = t = RSTRING_END(str);
5493     /* remove spaces at head */
5494     while (s < e) {
5495         int cc = rb_enc_codepoint(s, e, enc);
5496
5497         if (!rb_enc_isspace(cc, enc)) break;
5498         s += rb_enc_codelen(cc, enc);
5499     }
5500
5501     if (s > RSTRING_PTR(str)) {
5502         rb_str_modify(str);
5503         STR_SET_LEN(str, t-s);
5504         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
5505         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5506         return str;
5507     }
5508     return Qnil;
5509 }
5510
5511
5512 /*
5513  *  call-seq:
5514  *     str.lstrip   => new_str
5515  *
5516  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
5517  *  <code>String#rstrip</code> and <code>String#strip</code>.
5518  *
5519  *     "  hello  ".lstrip   #=> "hello  "
5520  *     "hello".lstrip       #=> "hello"
5521  */
5522
5523 static VALUE
5524 rb_str_lstrip(VALUE str)
5525 {
5526     str = rb_str_dup(str);
5527     rb_str_lstrip_bang(str);
5528     return str;
5529 }
5530
5531
5532 /*
5533  *  call-seq:
5534  *     str.rstrip!   => self or nil
5535  *
5536  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
5537  *  no change was made. See also <code>String#lstrip!</code> and
5538  *  <code>String#strip!</code>.
5539  *
5540  *     "  hello  ".rstrip   #=> "  hello"
5541  *     "hello".rstrip!      #=> nil
5542  */
5543
5544 static VALUE
5545 rb_str_rstrip_bang(VALUE str)
5546 {
5547     rb_encoding *enc;
5548     char *s, *t, *e;
5549     int space_seen = Qfalse;
5550
5551     rb_str_modify(str);
5552     enc = STR_ENC_GET(str);
5553     s = RSTRING_PTR(str);
5554     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5555     t = e = RSTRING_END(str);
5556     while (s < e) {
5557         int cc = rb_enc_codepoint(s, e, enc);
5558
5559         if (!cc || rb_enc_isspace(cc, enc)) {
5560             if (!space_seen) t = s;
5561             space_seen = Qtrue;
5562         }
5563         else {
5564             space_seen = Qfalse;
5565         }
5566         s += rb_enc_codelen(cc, enc);
5567     }
5568     if (!space_seen) t = s;
5569     if (t < e) {
5570         rb_str_modify(str);
5571         STR_SET_LEN(str, t-RSTRING_PTR(str));
5572         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5573         return str;
5574     }
5575     return Qnil;
5576 }
5577
5578
5579 /*
5580  *  call-seq:
5581  *     str.rstrip   => new_str
5582  *
5583  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
5584  *  <code>String#lstrip</code> and <code>String#strip</code>.
5585  *
5586  *     "  hello  ".rstrip   #=> "  hello"
5587  *     "hello".rstrip       #=> "hello"
5588  */
5589
5590 static VALUE
5591 rb_str_rstrip(VALUE str)
5592 {
5593     str = rb_str_dup(str);
5594     rb_str_rstrip_bang(str);
5595     return str;
5596 }
5597
5598
5599 /*
5600  *  call-seq:
5601  *     str.strip!   => str or nil
5602  *
5603  *  Removes leading and trailing whitespace from <i>str</i>. Returns
5604  *  <code>nil</code> if <i>str</i> was not altered.
5605  */
5606
5607 static VALUE
5608 rb_str_strip_bang(VALUE str)
5609 {
5610     VALUE l = rb_str_lstrip_bang(str);
5611     VALUE r = rb_str_rstrip_bang(str);
5612
5613     if (NIL_P(l) && NIL_P(r)) return Qnil;
5614     return str;
5615 }
5616
5617
5618 /*
5619  *  call-seq:
5620  *     str.strip   => new_str
5621  *
5622  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
5623  *
5624  *     "    hello    ".strip   #=> "hello"
5625  *     "\tgoodbye\r\n".strip   #=> "goodbye"
5626  */
5627
5628 static VALUE
5629 rb_str_strip(VALUE str)
5630 {
5631     str = rb_str_dup(str);
5632     rb_str_strip_bang(str);
5633     return str;
5634 }
5635
5636 static VALUE
5637 scan_once(VALUE str, VALUE pat, long *start)
5638 {
5639     VALUE result, match;
5640     struct re_registers *regs;
5641     long i;
5642
5643     if (rb_reg_search(pat, str, *start, 0) >= 0) {
5644         match = rb_backref_get();
5645         regs = RMATCH_REGS(match);
5646         if (BEG(0) == END(0)) {
5647             rb_encoding *enc = STR_ENC_GET(str);
5648             /*
5649              * Always consume at least one character of the input string
5650              */
5651             if (RSTRING_LEN(str) > END(0))
5652                 *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
5653                                               RSTRING_END(str), enc);
5654             else
5655                 *start = END(0)+1;
5656         }
5657         else {
5658             *start = END(0);
5659         }
5660         if (regs->num_regs == 1) {
5661             return rb_reg_nth_match(0, match);
5662         }
5663         result = rb_ary_new2(regs->num_regs);
5664         for (i=1; i < regs->num_regs; i++) {
5665             rb_ary_push(result, rb_reg_nth_match(i, match));
5666         }
5667
5668         return result;
5669     }
5670     return Qnil;
5671 }
5672
5673
5674 /*
5675  *  call-seq:
5676  *     str.scan(pattern)                         => array
5677  *     str.scan(pattern) {|match, ...| block }   => str
5678  *
5679  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
5680  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
5681  *  generated and either added to the result array or passed to the block. If
5682  *  the pattern contains no groups, each individual result consists of the
5683  *  matched string, <code>$&</code>.  If the pattern contains groups, each
5684  *  individual result is itself an array containing one entry per group.
5685  *
5686  *     a = "cruel world"
5687  *     a.scan(/\w+/)        #=> ["cruel", "world"]
5688  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
5689  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
5690  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
5691  *
5692  *  And the block form:
5693  *
5694  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
5695  *     print "\n"
5696  *     a.scan(/(.)(.)/) {|x,y| print y, x }
5697  *     print "\n"
5698  *
5699  *  <em>produces:</em>
5700  *
5701  *     <<cruel>> <<world>>
5702  *     rceu lowlr
5703  */
5704
5705 static VALUE
5706 rb_str_scan(VALUE str, VALUE pat)
5707 {
5708     VALUE result;
5709     long start = 0;
5710     long last = -1, prev = 0;
5711     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
5712
5713     pat = get_pat(pat, 1);
5714     if (!rb_block_given_p()) {
5715         VALUE ary = rb_ary_new();
5716
5717         while (!NIL_P(result = scan_once(str, pat, &start))) {
5718             last = prev;
5719             prev = start;
5720             rb_ary_push(ary, result);
5721         }
5722         if (last >= 0) rb_reg_search(pat, str, last, 0);
5723         return ary;
5724     }
5725
5726     while (!NIL_P(result = scan_once(str, pat, &start))) {
5727         last = prev;
5728         prev = start;
5729         rb_yield(result);
5730         str_mod_check(str, p, len);
5731     }
5732     if (last >= 0) rb_reg_search(pat, str, last, 0);
5733     return str;
5734 }
5735
5736
5737 /*
5738  *  call-seq:
5739  *     str.hex   => integer
5740  *
5741  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
5742  *  (with an optional sign and an optional <code>0x</code>) and returns the
5743  *  corresponding number. Zero is returned on error.
5744  *
5745  *     "0x0a".hex     #=> 10
5746  *     "-1234".hex    #=> -4660
5747  *     "0".hex        #=> 0
5748  *     "wombat".hex   #=> 0
5749  */
5750
5751 static VALUE
5752 rb_str_hex(VALUE str)
5753 {
5754     rb_encoding *enc = rb_enc_get(str);
5755
5756     if (!rb_enc_asciicompat(enc)) {
5757         rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5758     }
5759     return rb_str_to_inum(str, 16, Qfalse);
5760 }
5761
5762
5763 /*
5764  *  call-seq:
5765  *     str.oct   => integer
5766  *
5767  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
5768  *  optional sign) and returns the corresponding number.  Returns 0 if the
5769  *  conversion fails.
5770  *
5771  *     "123".oct       #=> 83
5772  *     "-377".oct      #=> -255
5773  *     "bad".oct       #=> 0
5774  *     "0377bad".oct   #=> 255
5775  */
5776
5777 static VALUE
5778 rb_str_oct(VALUE str)
5779 {
5780     rb_encoding *enc = rb_enc_get(str);
5781
5782     if (!rb_enc_asciicompat(enc)) {
5783         rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5784     }
5785     return rb_str_to_inum(str, -8, Qfalse);
5786 }
5787
5788
5789 /*
5790  *  call-seq:
5791  *     str.crypt(other_str)   => new_str
5792  *
5793  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
5794  *  library function <code>crypt</code>. The argument is the salt string, which
5795  *  should be two characters long, each character drawn from
5796  *  <code>[a-zA-Z0-9./]</code>.
5797  */
5798
5799 static VALUE
5800 rb_str_crypt(VALUE str, VALUE salt)
5801 {
5802     extern char *crypt(const char *, const char *);
5803     VALUE result;
5804     const char *s;
5805
5806     StringValue(salt);
5807     if (RSTRING_LEN(salt) < 2)
5808         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
5809
5810     if (RSTRING_PTR(str)) s = RSTRING_PTR(str);
5811     else s = "";
5812     result = rb_str_new2(crypt(s, RSTRING_PTR(salt)));
5813     OBJ_INFECT(result, str);
5814     OBJ_INFECT(result, salt);
5815     return result;
5816 }
5817
5818
5819 /*
5820  *  call-seq:
5821  *     str.intern   => symbol
5822  *     str.to_sym   => symbol
5823  *
5824  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
5825  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
5826  *
5827  *     "Koala".intern         #=> :Koala
5828  *     s = 'cat'.to_sym       #=> :cat
5829  *     s == :cat              #=> true
5830  *     s = '@cat'.to_sym      #=> :@cat
5831  *     s == :@cat             #=> true
5832  *
5833  *  This can also be used to create symbols that cannot be represented using the
5834  *  <code>:xxx</code> notation.
5835  *
5836  *     'cat and dog'.to_sym   #=> :"cat and dog"
5837  */
5838
5839 VALUE
5840 rb_str_intern(VALUE s)
5841 {
5842     VALUE str = RB_GC_GUARD(s);
5843     ID id;
5844
5845     id = rb_intern_str(str);
5846     return ID2SYM(id);
5847 }
5848
5849
5850 /*
5851  *  call-seq:
5852  *     str.ord   => integer
5853  *
5854  *  Return the <code>Integer</code> ordinal of a one-character string.
5855  *
5856  *     "a".ord         #=> 97
5857  */
5858
5859 VALUE
5860 rb_str_ord(VALUE s)
5861 {
5862     int c;
5863
5864     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
5865     return INT2NUM(c);
5866 }
5867 /*
5868  *  call-seq:
5869  *     str.sum(n=16)   => integer
5870  *
5871  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
5872  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
5873  *  to 16. The result is simply the sum of the binary value of each character in
5874  *  <i>str</i> modulo <code>2**n</code>. This is not a particularly good
5875  *  checksum.
5876  */
5877
5878 static VALUE
5879 rb_str_sum(int argc, VALUE *argv, VALUE str)
5880 {
5881     VALUE vbits;
5882     int bits;
5883     char *ptr, *p, *pend;
5884     long len;
5885
5886     if (argc == 0) {
5887         bits = 16;
5888     }
5889     else {
5890         rb_scan_args(argc, argv, "01", &vbits);
5891         bits = NUM2INT(vbits);
5892     }
5893     ptr = p = RSTRING_PTR(str);
5894     len = RSTRING_LEN(str);
5895     pend = p + len;
5896     if (bits >= sizeof(long)*CHAR_BIT) {
5897         VALUE sum = INT2FIX(0);
5898
5899         while (p < pend) {
5900             str_mod_check(str, ptr, len);
5901             sum = rb_funcall(sum, '+', 1, INT2FIX((unsigned char)*p));
5902             p++;
5903         }
5904         if (bits != 0) {
5905             VALUE mod;
5906
5907             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
5908             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
5909             sum = rb_funcall(sum, '&', 1, mod);
5910         }
5911         return sum;
5912     }
5913     else {
5914        unsigned long sum = 0;
5915
5916         while (p < pend) {
5917             str_mod_check(str, ptr, len);
5918             sum += (unsigned char)*p;
5919             p++;
5920         }
5921         if (bits != 0) {
5922            sum &= (((unsigned long)1)<<bits)-1;
5923         }
5924         return rb_int2inum(sum);
5925     }
5926 }
5927
5928 static VALUE
5929 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
5930 {
5931     rb_encoding *enc;
5932     VALUE w;
5933     long width, len, flen = 1, fclen = 1;
5934     VALUE res;
5935     char *p;
5936     const char *f = " ";
5937     long n, llen, rlen;
5938     volatile VALUE pad;
5939     int singlebyte = 1;
5940
5941     rb_scan_args(argc, argv, "11", &w, &pad);
5942     enc = STR_ENC_GET(str);
5943     width = NUM2LONG(w);
5944     if (argc == 2) {
5945         StringValue(pad);
5946         enc = rb_enc_check(str, pad);
5947         f = RSTRING_PTR(pad);
5948         flen = RSTRING_LEN(pad);
5949         fclen = str_strlen(pad, enc);
5950         singlebyte = single_byte_optimizable(pad);
5951         if (flen == 0 || fclen == 0) {
5952             rb_raise(rb_eArgError, "zero width padding");
5953         }
5954     }
5955     len = str_strlen(str, enc);
5956     if (width < 0 || len >= width) return rb_str_dup(str);
5957     n = width - len;
5958     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
5959     rlen = n - llen;
5960     res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2);
5961     p = RSTRING_PTR(res);
5962     while (llen) {
5963         if (flen <= 1) {
5964             *p++ = *f;
5965             llen--;
5966         }
5967         else if (llen > fclen) {
5968             memcpy(p,f,flen);
5969             p += flen;
5970             llen -= fclen;
5971         }
5972         else {
5973             char *fp = str_nth(f, f+flen, llen, enc, singlebyte);
5974             n = fp - f;
5975             memcpy(p,f,n);
5976             p+=n;
5977             break;
5978         }
5979     }
5980     memcpy(p, RSTRING_PTR(str), RSTRING_LEN(str));
5981     p+=RSTRING_LEN(str);
5982     while (rlen) {
5983         if (flen <= 1) {
5984             *p++ = *f;
5985             rlen--;
5986         }
5987         else if (rlen > fclen) {
5988             memcpy(p,f,flen);
5989             p += flen;
5990             rlen -= fclen;
5991         }
5992         else {
5993             char *fp = str_nth(f, f+flen, rlen, enc, singlebyte);
5994             n = fp - f;
5995             memcpy(p,f,n);
5996             p+=n;
5997             break;
5998         }
5999     }
6000     *p = '\0';
6001     STR_SET_LEN(res, p-RSTRING_PTR(res));
6002     OBJ_INFECT(res, str);
6003     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
6004     rb_enc_associate(res, enc);
6005     return res;
6006 }
6007
6008
6009 /*
6010  *  call-seq:
6011  *     str.ljust(integer, padstr=' ')   => new_str
6012  *
6013  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6014  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
6015  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6016  *
6017  *     "hello".ljust(4)            #=> "hello"
6018  *     "hello".ljust(20)           #=> "hello               "
6019  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
6020  */
6021
6022 static VALUE
6023 rb_str_ljust(int argc, VALUE *argv, VALUE str)
6024 {
6025     return rb_str_justify(argc, argv, str, 'l');
6026 }
6027
6028
6029 /*
6030  *  call-seq:
6031  *     str.rjust(integer, padstr=' ')   => new_str
6032  *
6033  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6034  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
6035  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6036  *
6037  *     "hello".rjust(4)            #=> "hello"
6038  *     "hello".rjust(20)           #=> "               hello"
6039  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
6040  */
6041
6042 static VALUE
6043 rb_str_rjust(int argc, VALUE *argv, VALUE str)
6044 {
6045     return rb_str_justify(argc, argv, str, 'r');
6046 }
6047
6048
6049 /*
6050  *  call-seq:
6051  *     str.center(integer, padstr=' ')   => new_str
6052  *
6053  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6054  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
6055  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6056  *
6057  *     "hello".center(4)         #=> "hello"
6058  *     "hello".center(20)        #=> "       hello        "
6059  *     "hello".center(20, '123') #=> "1231231hello12312312"
6060  */
6061
6062 static VALUE
6063 rb_str_center(int argc, VALUE *argv, VALUE str)
6064 {
6065     return rb_str_justify(argc, argv, str, 'c');
6066 }
6067
6068 /*
6069  *  call-seq:
6070  *     str.partition(sep)              => [head, sep, tail]
6071  *
6072  *  Searches the string for <i>sep</i> and returns the part before
6073  *  it, the <i>sep</i>, and the part after it.  If <i>sep</i> is not found,
6074  *  returns <i>str</i> and two empty strings.
6075  *
6076  *     "hello".partition("l")         #=> ["he", "l", "lo"]
6077  *     "hello".partition("x")         #=> ["hello", "", ""]
6078  */
6079
6080 static VALUE
6081 rb_str_partition(VALUE str, VALUE sep)
6082 {
6083     long pos;
6084     int regex = Qfalse;
6085
6086     if (TYPE(sep) == T_REGEXP) {
6087         pos = rb_reg_search(sep, str, 0, 0);
6088         regex = Qtrue;
6089     }
6090     else {
6091         VALUE tmp;
6092
6093         tmp = rb_check_string_type(sep);
6094         if (NIL_P(tmp)) {
6095             rb_raise(rb_eTypeError, "type mismatch: %s given",
6096                      rb_obj_classname(sep));
6097         }
6098         pos = rb_str_index(str, sep, 0);
6099     }
6100     if (pos < 0) {
6101       failed:
6102         return rb_ary_new3(3, str, rb_str_new(0,0),rb_str_new(0,0));
6103     }
6104     if (regex) {
6105         sep = rb_str_subpat(str, sep, 0);
6106         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
6107     }
6108     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
6109                           sep,
6110                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
6111                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
6112 }
6113
6114 /*
6115  *  call-seq:
6116  *     str.rpartition(sep)            => [head, sep, tail]
6117  *
6118  *  Searches <i>sep</i> in the string from the end of the string, and
6119  *  returns the part before it, the <i>sep</i>, and the part after it.
6120  *  If <i>sep</i> is not found, returns two empty strings and
6121  *  <i>str</i>.
6122  *
6123  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
6124  *     "hello".rpartition("x")         #=> ["", "", "hello"]
6125  */
6126
6127 static VALUE
6128 rb_str_rpartition(VALUE str, VALUE sep)
6129 {
6130     long pos = RSTRING_LEN(str);
6131     int regex = Qfalse;
6132
6133     if (TYPE(sep) == T_REGEXP) {
6134         pos = rb_reg_search(sep, str, pos, 1);
6135         regex = Qtrue;
6136     }
6137     else {
6138         VALUE tmp;
6139
6140         tmp = rb_check_string_type(sep);
6141         if (NIL_P(tmp)) {
6142             rb_raise(rb_eTypeError, "type mismatch: %s given",
6143                      rb_obj_classname(sep));
6144         }
6145         pos = rb_str_sublen(str, pos);
6146         pos = rb_str_rindex(str, sep, pos);
6147     }
6148     if (pos < 0) {
6149         return rb_ary_new3(3, rb_str_new(0,0),rb_str_new(0,0), str);
6150     }
6151     if (regex) {
6152         sep = rb_reg_nth_match(0, rb_backref_get());
6153     }
6154     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
6155                           sep,
6156                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
6157 }
6158
6159 /*
6160  *  call-seq:
6161  *     str.start_with?([prefix]+)   => true or false
6162  *
6163  *  Returns true if <i>str</i> starts with the prefix given.
6164  */
6165
6166 static VALUE
6167 rb_str_start_with(int argc, VALUE *argv, VALUE str)
6168 {
6169     int i;
6170
6171     for (i=0; i<argc; i++) {
6172         VALUE tmp = rb_check_string_type(argv[i]);
6173         if (NIL_P(tmp)) continue;
6174         rb_enc_check(str, tmp);
6175         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6176         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6177             return Qtrue;
6178     }
6179     return Qfalse;
6180 }
6181
6182 /*
6183  *  call-seq:
6184  *     str.end_with?([suffix]+)   => true or false
6185  *
6186  *  Returns true if <i>str</i> ends with the suffix given.
6187  */
6188
6189 static VALUE
6190 rb_str_end_with(int argc, VALUE *argv, VALUE str)
6191 {
6192     int i;
6193     char *p, *s;
6194     rb_encoding *enc;
6195
6196     for (i=0; i<argc; i++) {
6197         VALUE tmp = rb_check_string_type(argv[i]);
6198         if (NIL_P(tmp)) continue;
6199         enc = rb_enc_check(str, tmp);
6200         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6201         p = RSTRING_PTR(str);
6202         s = p + RSTRING_LEN(str) - RSTRING_LEN(tmp);
6203         if (rb_enc_left_char_head(p, s, enc) != s)
6204             continue;
6205         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6206             return Qtrue;
6207     }
6208     return Qfalse;
6209 }
6210
6211 void
6212 rb_str_setter(VALUE val, ID id, VALUE *var)
6213 {
6214     if (!NIL_P(val) && TYPE(val) != T_STRING) {
6215         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
6216     }
6217     *var = val;
6218 }
6219
6220
6221 /*
6222  *  call-seq:
6223  *     str.force_encoding(encoding)   => str
6224  *
6225  *  Changes the encoding to +encoding+ and returns self.
6226  */
6227
6228 static VALUE
6229 rb_str_force_encoding(VALUE str, VALUE enc)
6230 {
6231     str_modifiable(str);
6232     rb_enc_associate(str, rb_to_encoding(enc));
6233     return str;
6234 }
6235
6236 /*
6237  *  call-seq:
6238  *     str.valid_encoding?  => true or false
6239  *
6240  *  Returns true for a string which encoded correctly.
6241  *
6242  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding? => true
6243  *    "\xc2".force_encoding("UTF-8").valid_encoding? => false
6244  *    "\x80".force_encoding("UTF-8").valid_encoding? => false
6245  */
6246
6247 static VALUE
6248 rb_str_valid_encoding_p(VALUE str)
6249 {
6250     int cr = rb_enc_str_coderange(str);
6251
6252     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
6253 }
6254
6255 /*
6256  *  call-seq:
6257  *     str.ascii_only?  => true or false
6258  *
6259  *  Returns true for a string which has only ASCII characters.
6260  *
6261  *    "abc".force_encoding("UTF-8").ascii_only? => true
6262  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only? => false
6263  */
6264
6265 static VALUE
6266 rb_str_is_ascii_only_p(VALUE str)
6267 {
6268     int cr = rb_enc_str_coderange(str);
6269
6270     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
6271 }
6272
6273 /**********************************************************************
6274  * Document-class: Symbol
6275  *
6276  *  <code>Symbol</code> objects represent names and some strings
6277  *  inside the Ruby
6278  *  interpreter. They are generated using the <code>:name</code> and
6279  *  <code>:"string"</code> literals
6280  *  syntax, and by the various <code>to_sym</code> methods. The same
6281  *  <code>Symbol</code> object will be created for a given name or string
6282  *  for the duration of a program's execution, regardless of the context
6283  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
6284  *  one context, a method in another, and a class in a third, the
6285  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
6286  *  all three contexts.
6287  *
6288  *     module One
6289  *       class Fred
6290  *       end
6291  *       $f1 = :Fred
6292  *     end
6293  *     module Two
6294  *       Fred = 1
6295  *       $f2 = :Fred
6296  *     end
6297  *     def Fred()
6298  *     end
6299  *     $f3 = :Fred
6300  *     $f1.object_id   #=> 2514190
6301  *     $f2.object_id   #=> 2514190
6302  *     $f3.object_id   #=> 2514190
6303  *
6304  */
6305
6306
6307 /*
6308  *  call-seq:
6309  *     sym == obj   => true or false
6310  *
6311  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
6312  *  symbol, returns <code>true</code>. Otherwise, compares them
6313  *  as strings.
6314  */
6315
6316 static VALUE
6317 sym_equal(VALUE sym1, VALUE sym2)
6318 {
6319     if (sym1 == sym2) return Qtrue;
6320     return Qfalse;
6321 }
6322
6323
6324 /*
6325  *  call-seq:
6326  *     sym.inspect    => string
6327  *
6328  *  Returns the representation of <i>sym</i> as a symbol literal.
6329  *
6330  *     :fred.inspect   #=> ":fred"
6331  */
6332
6333 static VALUE
6334 sym_inspect(VALUE sym)
6335 {
6336     VALUE str;
6337     ID id = SYM2ID(sym);
6338     rb_encoding *enc;
6339
6340     sym = rb_id2str(id);
6341     enc = STR_ENC_GET(sym);
6342     str = rb_enc_str_new(0, RSTRING_LEN(sym)+1, enc);
6343     RSTRING_PTR(str)[0] = ':';
6344     memcpy(RSTRING_PTR(str)+1, RSTRING_PTR(sym), RSTRING_LEN(sym));
6345     if (RSTRING_LEN(sym) != strlen(RSTRING_PTR(sym)) ||
6346         !rb_enc_symname_p(RSTRING_PTR(sym), enc)) {
6347         str = rb_str_inspect(str);
6348         strncpy(RSTRING_PTR(str), ":\"", 2);
6349     }
6350     return str;
6351 }
6352
6353
6354 /*
6355  *  call-seq:
6356  *     sym.id2name   => string
6357  *     sym.to_s      => string
6358  *
6359  *  Returns the name or string corresponding to <i>sym</i>.
6360  *
6361  *     :fred.id2name   #=> "fred"
6362  */
6363
6364
6365 VALUE
6366 rb_sym_to_s(VALUE sym)
6367 {
6368     ID id = SYM2ID(sym);
6369
6370     return str_new3(rb_cString, rb_id2str(id));
6371 }
6372
6373
6374 /*
6375  * call-seq:
6376  *   sym.to_sym   => sym
6377  *   sym.intern   => sym
6378  *
6379  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
6380  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
6381  * in this case.
6382  */
6383
6384 static VALUE
6385 sym_to_sym(VALUE sym)
6386 {
6387     return sym;
6388 }
6389
6390 static VALUE
6391 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
6392 {
6393     VALUE obj;
6394
6395     if (argc < 1) {
6396         rb_raise(rb_eArgError, "no receiver given");
6397     }
6398     obj = argv[0];
6399     return rb_funcall3(obj, (ID)sym, argc - 1, argv + 1);
6400 }
6401
6402 /*
6403  * call-seq:
6404  *   sym.to_proc
6405  *
6406  * Returns a _Proc_ object which respond to the given method by _sym_.
6407  *
6408  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
6409  */
6410
6411 static VALUE
6412 sym_to_proc(VALUE sym)
6413 {
6414     return rb_proc_new(sym_call, (VALUE)SYM2ID(sym));
6415 }
6416
6417
6418 static VALUE
6419 sym_succ(VALUE sym)
6420 {
6421     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
6422 }
6423
6424 static VALUE
6425 sym_cmp(VALUE sym, VALUE other)
6426 {
6427     if (!SYMBOL_P(other)) {
6428         return Qnil;
6429     }
6430     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
6431 }
6432
6433 static VALUE
6434 sym_casecmp(VALUE sym, VALUE other)
6435 {
6436     if (!SYMBOL_P(other)) {
6437         return Qnil;
6438     }
6439     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
6440 }
6441
6442 static VALUE
6443 sym_match(VALUE sym, VALUE other)
6444 {
6445     return rb_str_match(rb_sym_to_s(sym), other);
6446 }
6447
6448 static VALUE
6449 sym_eqq(VALUE sym, VALUE other)
6450 {
6451     if (sym == other) return Qtrue;
6452     return rb_str_equal(rb_sym_to_s(sym), other);
6453 }
6454
6455 static VALUE
6456 sym_aref(int argc, VALUE *argv, VALUE sym)
6457 {
6458     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
6459 }
6460
6461 static VALUE
6462 sym_length(VALUE sym)
6463 {
6464     return rb_str_length(rb_id2str(SYM2ID(sym)));
6465 }
6466
6467 static VALUE
6468 sym_empty(VALUE sym)
6469 {
6470     return rb_str_empty(rb_id2str(SYM2ID(sym)));
6471 }
6472
6473 static VALUE
6474 sym_upcase(VALUE sym)
6475 {
6476     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
6477 }
6478
6479 static VALUE
6480 sym_downcase(VALUE sym)
6481 {
6482     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
6483 }
6484
6485 static VALUE
6486 sym_capitalize(VALUE sym)
6487 {
6488     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
6489 }
6490
6491 static VALUE
6492 sym_swapcase(VALUE sym)
6493 {
6494     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
6495 }
6496
6497 static VALUE
6498 sym_encoding(VALUE sym)
6499 {
6500     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
6501 }
6502
6503 ID
6504 rb_to_id(VALUE name)
6505 {
6506     VALUE tmp;
6507     ID id;
6508
6509     switch (TYPE(name)) {
6510       default:
6511         tmp = rb_check_string_type(name);
6512         if (NIL_P(tmp)) {
6513             rb_raise(rb_eTypeError, "%s is not a symbol",
6514                      RSTRING_PTR(rb_inspect(name)));
6515         }
6516         name = tmp;
6517         /* fall through */
6518       case T_STRING:
6519         name = rb_str_intern(name);
6520         /* fall through */
6521       case T_SYMBOL:
6522         return SYM2ID(name);
6523     }
6524     return id;
6525 }
6526
6527 /*
6528  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
6529  *  bytes, typically representing characters. String objects may be created
6530  *  using <code>String::new</code> or as literals.
6531  *
6532  *  Because of aliasing issues, users of strings should be aware of the methods
6533  *  that modify the contents of a <code>String</code> object.  Typically,
6534  *  methods with names ending in ``!'' modify their receiver, while those
6535  *  without a ``!'' return a new <code>String</code>.  However, there are
6536  *  exceptions, such as <code>String#[]=</code>.
6537  *
6538  */
6539
6540 void
6541 Init_String(void)
6542 {
6543 #undef rb_intern /* NOTE(review): presumably so rb_intern("to_s") below uses the plain function, not a macro -- confirm */
6544     /* Define the String class first, then Symbol further down, and register their methods. */
6545     rb_cString  = rb_define_class("String", rb_cObject);
6546     rb_include_module(rb_cString, rb_mComparable);
6547     rb_define_alloc_func(rb_cString, str_alloc);
6548     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
6549     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
6550     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
6551     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
6552     rb_define_method(rb_cString, "==", rb_str_equal, 1);
6553     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
6554     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
6555     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
6556     rb_define_method(rb_cString, "+", rb_str_plus, 1);
6557     rb_define_method(rb_cString, "*", rb_str_times, 1);
6558     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
6559     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
6560     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
6561     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
6562     rb_define_method(rb_cString, "length", rb_str_length, 0);
6563     rb_define_method(rb_cString, "size", rb_str_length, 0);
6564     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
6565     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
6566     rb_define_method(rb_cString, "=~", rb_str_match, 1);
6567     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
6568     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
6569     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
6570     rb_define_method(rb_cString, "next", rb_str_succ, 0);
6571     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
6572     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
6573     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
6574     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
6575     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
6576     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
6577     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
6578     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
6579     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
6580     /* conversions: to_i, to_f, to_s/to_str (same function), inspect, dump */
6581     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
6582     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
6583     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
6584     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
6585     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
6586     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
6587     /* case conversion, non-bang forms */
6588     rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
6589     rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
6590     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
6591     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
6592     /* case conversion, bang forms */
6593     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
6594     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
6595     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
6596     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
6597     /* hex/oct, split, lines/bytes/chars, reverse, concat (also <<), crypt, intern (also to_sym), ord */
6598     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
6599     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
6600     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
6601     rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
6602     rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
6603     rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
6604     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
6605     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
6606     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
6607     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
6608     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
6609     rb_define_method(rb_cString, "intern", rb_str_intern, 0);
6610     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
6611     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
6612     /* substring, prefix and suffix predicates */
6613     rb_define_method(rb_cString, "include?", rb_str_include, 1);
6614     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
6615     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
6616
6617     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
6618     /* justification */
6619     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
6620     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
6621     rb_define_method(rb_cString, "center", rb_str_center, -1);
6622     /* substitution and trimming, non-bang forms */
6623     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
6624     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
6625     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
6626     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
6627     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
6628     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
6629     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
6630     /* substitution and trimming, bang forms */
6631     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
6632     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
6633     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
6634     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
6635     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
6636     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
6637     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
6638     /* tr, tr_s, delete, squeeze and count, non-bang forms */
6639     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
6640     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
6641     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
6642     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
6643     rb_define_method(rb_cString, "count", rb_str_count, -1);
6644     /* tr, tr_s, delete and squeeze, bang forms */
6645     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
6646     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
6647     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
6648     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
6649     /* each_* iterators: the same C functions registered as lines/bytes/chars above */
6650     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
6651     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
6652     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
6653
6654     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
6655     /* slice shares rb_str_aref_m with [] */
6656     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
6657     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
6658     /* partitioning */
6659     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
6660     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
6661     /* encoding queries and force_encoding */
6662     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
6663     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
6664     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
6665     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
6666     /* store the ID for "to_s" in id_to_s (declared outside this function) */
6667     id_to_s = rb_intern("to_s");
6668     /* $; and $-F are both bound to rb_fs, which starts out as nil */
6669     rb_fs = Qnil;
6670     rb_define_variable("$;", &rb_fs);
6671     rb_define_variable("$-F", &rb_fs);
6672     /* Symbol: includes Comparable; its allocator and Symbol.new are undefined */
6673     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
6674     rb_include_module(rb_cSymbol, rb_mComparable);
6675     rb_undef_alloc_func(rb_cSymbol);
6676     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
6677     rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
6678     /* Symbol instance methods; several pairs share one C function */
6679     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
6680     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
6681     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
6682     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
6683     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
6684     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
6685     rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
6686     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
6687     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
6688     /* comparison and matching */
6689     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
6690     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
6691     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
6692     rb_define_method(rb_cSymbol, "===", sym_eqq, 1);
6693     /* string-like accessors; "match" is registered with sym_match, the same function as "=~" */
6694     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
6695     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
6696     rb_define_method(rb_cSymbol, "length", sym_length, 0);
6697     rb_define_method(rb_cSymbol, "size", sym_length, 0);
6698     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
6699     rb_define_method(rb_cSymbol, "match", sym_match, 1);
6700     /* case conversion */
6701     rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
6702     rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
6703     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
6704     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
6705
6706     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
6707 }