string.c

   1 /**********************************************************************
   2
   3   string.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 17:12:58 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
  10   Copyright (C) 2000  Information-technology Promotion Agency, Japan
  11
  12 **********************************************************************/
  13
  14 #include "ruby/ruby.h"
  15 #include "ruby/re.h"
  16 #include "ruby/encoding.h"
  17
     /* Shorthand for regs->beg[no] and regs->end[no].  Both expand to a member
      * access through a variable named `regs`, which therefore has to be in
      * scope wherever the macros are used (presumably match registers from
      * ruby/re.h -- confirm at the use sites). */
  18 #define BEG(no) regs->beg[no]
  19 #define END(no) regs->end[no]
  20
  21 #include <math.h>
  22 #include <ctype.h>
  23
  24 #ifdef HAVE_UNISTD_H
  25 #include <unistd.h>
  26 #endif
  27
     /* Remove any macro definitions of these names so that the identifiers can
      * be used for real function definitions and alias_func() declarations
      * further down in this file (e.g. rb_str_new_cstr, rb_str_new2). */
  28 #undef rb_str_new_cstr
  29 #undef rb_tainted_str_new_cstr
  30 #undef rb_usascii_str_new_cstr
  31 #undef rb_str_new2
  32 #undef rb_str_new3
  33 #undef rb_str_new4
  34 #undef rb_str_new5
  35 #undef rb_tainted_str_new2
  36 #undef rb_usascii_str_new2
  37 #undef rb_str_dup_frozen
  38 #undef rb_str_buf_new_cstr
  39 #undef rb_str_buf_new2
  40 #undef rb_str_buf_cat2
  41 #undef rb_str_cat2
  42
     /* Global class handles.  rb_cString is the class this file passes to
      * str_new()/str_alloc() for plain strings; rb_cSymbol is only defined
      * here and not referenced in the visible part of the file. */
  43 VALUE rb_cString;
  44 VALUE rb_cSymbol;
  45
     /*
      * alias_func(old_prot, new_name, args) provides the legacy entry point
      * declared by `old_prot` in terms of the function `new_name`.
      *
      * With GCC the old symbol is declared as an alias of new_name (the `args`
      * parameter is unused in that variant).  Otherwise a forwarding function
      * is generated that simply returns new_name applied to `args`.
      */
  46 #ifdef __GNUC__
  47 #define alias_func(old_prot, new_name, args) \
  48 VALUE old_prot __attribute__((alias(#new_name)));
  49 #else
  50 #define alias_func(old_prot, new_name, args) \
  51 VALUE old_prot {return new_name args;}
  52 #endif
  53
     /*
      * Per-string flag bits and the predicates built on them.
      *
      *   STR_TMPLOCK  - tested by str_modifiable(), which refuses to modify a
      *                  string while it is set.
      *   STR_NOEMBED  - set when the string uses the as.heap representation
      *                  (see STR_EMBED_P / STR_SET_NOEMBED below).
      *   STR_SHARED   - same bit as ELTS_SHARED.
      *   STR_ASSOC    - set by rb_str_associate().
      *
      * STR_SHARED_P / STR_ASSOC_P require STR_NOEMBED in addition to the
      * respective flag.  STR_NOCAPA_P is true for a non-embedded string that
      * is shared or associated; for such strings this file does not read
      * as.heap.aux.capa as a capacity (see rb_str_capacity and RESIZE_CAPA).
      * STR_UNSET_NOCAPA clears the shared/assoc bits of a non-embedded string.
      */
  54 #define STR_TMPLOCK FL_USER7
  55 #define STR_NOEMBED FL_USER1
  56 #define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
  57 #define STR_ASSOC   FL_USER3
  58 #define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
  59 #define STR_ASSOC_P(s)  FL_ALL(s, STR_NOEMBED|STR_ASSOC)
  60 #define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
  61 #define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
  62 #define STR_UNSET_NOCAPA(s) do {\
  63     if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
  64 } while (0)
  65
  66
     /*
      * Switching between the embedded and the heap representation.
      *
      *   STR_SET_NOEMBED(str)      - sets STR_NOEMBED and zeroes the embedded
      *                               length bits in the flags word.
      *   STR_SET_EMBED(str)        - clears STR_NOEMBED; the embedded length
      *                               is NOT touched and must be set separately.
      *   STR_EMBED_P(str)          - true when STR_NOEMBED is not set.
      *   STR_SET_EMBED_LEN(str, n) - stores n in the flag bits selected by
      *                               RSTRING_EMBED_LEN_MASK.  n is copied to a
      *                               local first, so it is evaluated once;
      *                               str is evaluated twice.
      */
  67 #define STR_SET_NOEMBED(str) do {\
  68     FL_SET(str, STR_NOEMBED);\
  69     STR_SET_EMBED_LEN(str, 0);\
  70 } while (0)
  71 #define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
  72 #define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
  73 #define STR_SET_EMBED_LEN(str, n) do { \
  74     long tmp_n = (n);\
  75     RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
  76     RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
  77 } while (0)
  78
     /* STR_SET_LEN(str, n): records n as the byte length of str, writing to
      * the embedded length bits or to as.heap.len depending on the current
      * representation.  str is evaluated more than once. */
  79 #define STR_SET_LEN(str, n) do { \
  80     if (STR_EMBED_P(str)) {\
  81         STR_SET_EMBED_LEN(str, n);\
  82     }\
  83     else {\
  84         RSTRING(str)->as.heap.len = (n);\
  85     }\
  86 } while (0)
  87
     /* STR_DEC_LEN(str): decrements the byte length of str by one, for either
      * representation.  No check is made that the length is positive. */
  88 #define STR_DEC_LEN(str) do {\
  89     if (STR_EMBED_P(str)) {\
  90         long n = RSTRING_LEN(str);\
  91         n--;\
  92         STR_SET_EMBED_LEN(str, n);\
  93     }\
  94     else {\
  95         RSTRING(str)->as.heap.len--;\
  96     }\
  97 } while (0)
  98
  99 #define RESIZE_CAPA(str,capacity) do {\
 100     if (STR_EMBED_P(str)) {\
 101         if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
 102             char *tmp = ALLOC_N(char, capacity+1);\
 103             memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
 104             RSTRING(str)->as.heap.ptr = tmp;\
 105             RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
 106             STR_SET_NOEMBED(str);\
 107             RSTRING(str)->as.heap.aux.capa = (capacity);\
 108         }\
 109     }\
 110     else {\
 111         REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
 112         if (!STR_NOCAPA_P(str))\
 113             RSTRING(str)->as.heap.aux.capa = (capacity);\
 114     }\
 115 } while (0)
 116
     /* Coderange predicates.  Both go through rb_enc_str_coderange(), which
      * scans the string (and caches the result) when the coderange is still
      * unknown. */
 117 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
 118 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
 119
     /* The rb_encoding* that corresponds to the encoding index stored in str. */
 120 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
 121
 122 static int
 123 single_byte_optimizable(VALUE str)
 124 {
 125     rb_encoding *enc = STR_ENC_GET(str);
 126
 127     if (rb_enc_mbmaxlen(enc) == 1)
 128         return 1;
 129
 130     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
 131     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
 132         return 1;
 133
 134     /* Conservative.  Possibly single byte.
 135      * "\xa1" in Shift_JIS for example. */
 136     return 0;
 137 }
 138
     /* Global VALUE defined here; it is not referenced in the visible part of
      * this file.  NOTE(review): presumably the default field separator used
      * by String#split -- confirm against its users. */
 139 VALUE rb_fs;
 140
     /*
      * Returns a pointer to the first byte in [p, e) for which ISASCII() is
      * false, or NULL if every byte is ASCII.
      *
      * When SIZEOF_VALUE is 4 or 8, NONASCII_MASK is defined (it stays defined
      * for the rest of the file) and ranges longer than two VALUE-sized words
      * are scanned in three phases: bytes up to the first VALUE-aligned
      * address, then whole aligned words, then the remaining bytes.
      */
 141 static inline const char *
 142 search_nonascii(const char *p, const char *e)
 143 {
 144 #if SIZEOF_VALUE == 8
 145 # define NONASCII_MASK 0x8080808080808080LL
 146 #elif SIZEOF_VALUE == 4
 147 # define NONASCII_MASK 0x80808080UL
 148 #endif
 149 #ifdef NONASCII_MASK
 150     if (sizeof(VALUE) * 2 < e - p) {
 151         const VALUE *s, *t;
 152         const VALUE lowbits = sizeof(VALUE) - 1;
              /* s: p rounded up to the next VALUE-aligned address */
 153         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
 154         while (p < (const char *)s) {
 155             if (!ISASCII(*p))
 156                 return p;
 157             p++;
 158         }
              /* t: e rounded down to a VALUE-aligned address */
 159         t = (const VALUE*)(~lowbits & (VALUE)e);
 160         while (s < t) {
                  /* a high bit is set somewhere in this word: stop here and
                   * let the byte loop below find the exact position */
 161             if (*s & NONASCII_MASK) {
 162                 t = s;
 163                 break;
 164             }
 165             s++;
 166         }
 167         p = (const char *)t;
 168     }
 169 #endif
          /* byte-wise scan of whatever is left (or of the whole range when the
           * word-wise path above was not taken) */
 170     while (p < e) {
 171         if (!ISASCII(*p))
 172             return p;
 173         p++;
 174     }
 175     return NULL;
 176 }
 177
 178 static int
 179 coderange_scan(const char *p, long len, rb_encoding *enc)
 180 {
 181     const char *e = p + len;
 182
 183     if (rb_enc_to_index(enc) == 0) {
 184         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 185         p = search_nonascii(p, e);
 186         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 187     }
 188
 189     if (rb_enc_asciicompat(enc)) {
 190         p = search_nonascii(p, e);
 191         if (!p) {
 192             return ENC_CODERANGE_7BIT;
 193         }
 194         while (p < e) {
 195             int ret = rb_enc_precise_mbclen(p, e, enc);
 196             if (!MBCLEN_CHARFOUND_P(ret)) {
 197                 return ENC_CODERANGE_BROKEN;
 198             }
 199             p += MBCLEN_CHARFOUND_LEN(ret);
 200             if (p < e) {
 201                 p = search_nonascii(p, e);
 202                 if (!p) {
 203                     return ENC_CODERANGE_VALID;
 204                 }
 205             }
 206         }
 207         if (e < p) {
 208             return ENC_CODERANGE_BROKEN;
 209         }
 210         return ENC_CODERANGE_VALID;
 211     }
 212
 213     while (p < e) {
 214         int ret = rb_enc_precise_mbclen(p, e, enc);
 215
 216         if (!MBCLEN_CHARFOUND_P(ret)) {
 217             return ENC_CODERANGE_BROKEN;
 218         }
 219         p += MBCLEN_CHARFOUND_LEN(ret);
 220     }
 221     if (e < p) {
 222         return ENC_CODERANGE_BROKEN;
 223     }
 224     return ENC_CODERANGE_VALID;
 225 }
 226
     /*
      * Scans the bytes in [s, e) under encoding enc, updates *cr with the
      * resulting coderange and returns how many bytes (counted from s) were
      * consumed.
      *
      *  - If *cr is already ENC_CODERANGE_BROKEN nothing is scanned and e - s
      *    is returned.
      *  - When the whole range could be classified, *cr becomes 7BIT, VALID
      *    or BROKEN.  An incoming *cr of VALID is never downgraded to 7BIT.
      *  - When rb_enc_precise_mbclen() does not find a character at p, *cr is
      *    set to BROKEN if the bytes are invalid and to UNKNOWN otherwise, and
      *    the offset p - s of that character is returned, so the caller knows
      *    where the scan stopped.
      */
 227 long
 228 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
 229 {
 230     const char *p = s;
 231
 232     if (*cr == ENC_CODERANGE_BROKEN)
 233         return e - s;
 234
 235     if (rb_enc_to_index(enc) == 0) {
 236         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 237         p = search_nonascii(p, e);
 238         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
 239         return e - s;
 240     }
 241     else if (rb_enc_asciicompat(enc)) {
              /* ASCII-compatible: skip ASCII runs, validate only the rest */
 242         p = search_nonascii(p, e);
 243         if (!p) {
 244             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
 245             return e - s;
 246         }
 247         while (p < e) {
 248             int ret = rb_enc_precise_mbclen(p, e, enc);
 249             if (!MBCLEN_CHARFOUND_P(ret)) {
 250                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 251                 return p - s;
 252             }
 253             p += MBCLEN_CHARFOUND_LEN(ret);
 254             if (p < e) {
 255                 p = search_nonascii(p, e);
 256                 if (!p) {
 257                     *cr = ENC_CODERANGE_VALID;
 258                     return e - s;
 259                 }
 260             }
 261         }
 262         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
 263         return p - s;
 264     }
 265     else {
              /* not ASCII-compatible: every character has to be validated */
 266         while (p < e) {
 267             int ret = rb_enc_precise_mbclen(p, e, enc);
 268             if (!MBCLEN_CHARFOUND_P(ret)) {
 269                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 270                 return p - s;
 271             }
 272             p += MBCLEN_CHARFOUND_LEN(ret);
 273         }
 274         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
 275         return p - s;
 276     }
 277 }
 278
 279 static inline void
 280 str_enc_copy(VALUE str1, VALUE str2)
 281 {
 282     rb_enc_set_index(str1, ENCODING_GET(str2));
 283 }
 284
 285 static void
 286 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
 287 {
 288     /* this function is designed for copying encoding and coderange
 289      * from src to new string "dest" which is made from the part of src.
 290      */
 291     str_enc_copy(dest, src);
 292     switch (ENC_CODERANGE(src)) {
 293       case ENC_CODERANGE_7BIT:
 294         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 295         break;
 296       case ENC_CODERANGE_VALID:
 297         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
 298             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
 299             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 300         else
 301             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 302         break;
 303       default:
 304         if (RSTRING_LEN(dest) == 0) {
 305             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
 306                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 307             else
 308                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 309         }
 310         break;
 311     }
 312 }
 313
 314 static void
 315 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
 316 {
 317     str_enc_copy(dest, src);
 318     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
 319 }
 320
 321 int
 322 rb_enc_str_coderange(VALUE str)
 323 {
 324     int cr = ENC_CODERANGE(str);
 325
 326     if (cr == ENC_CODERANGE_UNKNOWN) {
 327         rb_encoding *enc = STR_ENC_GET(str);
 328         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
 329         ENC_CODERANGE_SET(str, cr);
 330     }
 331     return cr;
 332 }
 333
 334 int
 335 rb_enc_str_asciionly_p(VALUE str)
 336 {
 337     rb_encoding *enc = STR_ENC_GET(str);
 338
 339     if (!rb_enc_asciicompat(enc))
 340         return Qfalse;
 341     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
 342         return Qtrue;
 343     return Qfalse;
 344 }
 345
 346 static inline void
 347 str_mod_check(VALUE s, const char *p, long len)
 348 {
 349     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
 350         rb_raise(rb_eRuntimeError, "string modified");
 351     }
 352 }
 353
 354 static inline void
 355 str_frozen_check(VALUE s)
 356 {
 357     if (OBJ_FROZEN(s)) {
 358         rb_raise(rb_eRuntimeError, "string frozen");
 359     }
 360 }
 361
 362 size_t
 363 rb_str_capacity(VALUE str)
 364 {
 365     if (STR_EMBED_P(str)) {
 366         return RSTRING_EMBED_LEN_MAX;
 367     }
 368     else if (STR_NOCAPA_P(str)) {
 369         return RSTRING(str)->as.heap.len;
 370     }
 371     else {
 372         return RSTRING(str)->as.heap.aux.capa;
 373     }
 374 }
 375
 376 static inline VALUE
 377 str_alloc(VALUE klass)
 378 {
 379     NEWOBJ(str, struct RString);
 380     OBJSETUP(str, klass, T_STRING);
 381
 382     str->as.heap.ptr = 0;
 383     str->as.heap.len = 0;
 384     str->as.heap.aux.capa = 0;
 385
 386     return (VALUE)str;
 387 }
 388
 389 static VALUE
 390 str_new(VALUE klass, const char *ptr, long len)
 391 {
 392     VALUE str;
 393
 394     if (len < 0) {
 395         rb_raise(rb_eArgError, "negative string size (or size too big)");
 396     }
 397
 398     str = str_alloc(klass);
 399     if (len > RSTRING_EMBED_LEN_MAX) {
 400         RSTRING(str)->as.heap.aux.capa = len;
 401         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
 402         STR_SET_NOEMBED(str);
 403     }
 404     if (ptr) {
 405         memcpy(RSTRING_PTR(str), ptr, len);
 406     }
 407     STR_SET_LEN(str, len);
 408     RSTRING_PTR(str)[len] = '\0';
 409     return str;
 410 }
 411
 412 VALUE
 413 rb_str_new(const char *ptr, long len)
 414 {
 415     return str_new(rb_cString, ptr, len);
 416 }
 417
 418 VALUE
 419 rb_usascii_str_new(const char *ptr, long len)
 420 {
 421     VALUE str = rb_str_new(ptr, len);
 422     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 423     return str;
 424 }
 425
 426 VALUE
 427 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
 428 {
 429     VALUE str = rb_str_new(ptr, len);
 430     rb_enc_associate(str, enc);
 431     return str;
 432 }
 433
 434 VALUE
 435 rb_str_new_cstr(const char *ptr)
 436 {
 437     if (!ptr) {
 438         rb_raise(rb_eArgError, "NULL pointer given");
 439     }
 440     return rb_str_new(ptr, strlen(ptr));
 441 }
 442
 443 alias_func(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
 444 #define rb_str_new2 rb_str_new_cstr
 445
 446 VALUE
 447 rb_usascii_str_new_cstr(const char *ptr)
 448 {
 449     VALUE str = rb_str_new2(ptr);
 450     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 451     return str;
 452 }
 453
 454 alias_func(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
 455 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
 456
 457 VALUE
 458 rb_tainted_str_new(const char *ptr, long len)
 459 {
 460     VALUE str = rb_str_new(ptr, len);
 461
 462     OBJ_TAINT(str);
 463     return str;
 464 }
 465
 466 VALUE
 467 rb_tainted_str_new_cstr(const char *ptr)
 468 {
 469     VALUE str = rb_str_new2(ptr);
 470
 471     OBJ_TAINT(str);
 472     return str;
 473 }
 474
 475 alias_func(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
 476 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
 477
 478 static VALUE
 479 str_replace_shared(VALUE str2, VALUE str)
 480 {
 481     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
 482         STR_SET_EMBED(str2);
 483         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
 484         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
 485     }
 486     else {
 487         FL_SET(str2, STR_NOEMBED);
 488         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
 489         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
 490         RSTRING(str2)->as.heap.aux.shared = str;
 491         FL_SET(str2, ELTS_SHARED);
 492     }
 493     rb_enc_cr_str_exact_copy(str2, str);
 494
 495     return str2;
 496 }
 497
 498 static VALUE
 499 str_new_shared(VALUE klass, VALUE str)
 500 {
 501     return str_replace_shared(str_alloc(klass), str);
 502 }
 503
 504 static VALUE
 505 str_new3(VALUE klass, VALUE str)
 506 {
 507     return str_new_shared(klass, str);
 508 }
 509
 510 VALUE
 511 rb_str_new_shared(VALUE str)
 512 {
 513     VALUE str2 = str_new3(rb_obj_class(str), str);
 514
 515     OBJ_INFECT(str2, str);
 516     return str2;
 517 }
 518
 519 alias_func(rb_str_new3(VALUE str), rb_str_new_shared, (str))
 520 #define rb_str_new3 rb_str_new_shared
 521
     /*
      * Creates a new string object of class klass that refers to the same
      * buffer as str; no bytes are copied.
      *
      * If str is already shared, the new string references the same shared
      * root as str.  Otherwise str itself is flagged ELTS_SHARED and made to
      * reference the new string through aux.shared (overwriting str's
      * aux.capa, which lives in the same union).
      *
      * Encoding, coderange and taint are copied from str.  The caller is
      * expected to pass a non-embedded str: the heap pointer/length are used
      * unconditionally.
      */
 522 static VALUE
 523 str_new4(VALUE klass, VALUE str)
 524 {
 525     VALUE str2;
 526
 527     str2 = str_alloc(klass);
 528     STR_SET_NOEMBED(str2);
 529     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
 530     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
 531     if (STR_SHARED_P(str)) {
 532         FL_SET(str2, ELTS_SHARED);
 533         RSTRING(str2)->as.heap.aux.shared = RSTRING(str)->as.heap.aux.shared;
 534     }
 535     else {
 536         FL_SET(str, ELTS_SHARED);
 537         RSTRING(str)->as.heap.aux.shared = str2;
 538     }
 539     rb_enc_cr_str_exact_copy(str2, str);
 540     OBJ_INFECT(str2, str);
 541     return str2;
 542 }
 543
     /*
      * Returns a frozen string with the contents of orig.
      *
      *  - orig already frozen: orig itself is returned.
      *  - orig shared with a non-zero root: the root is reused when it has
      *    the same length and class as orig and taint allows it; otherwise a
      *    new string is made from the root and narrowed to orig's length.
      *  - orig embedded: a private copy is made.
      *  - orig associated: the association is moved from orig to the new
      *    string, which shares orig's buffer via str_new4().
      *  - otherwise: the new string shares orig's buffer via str_new4().
      *
      * Note that in the shared case the object that gets frozen may be the
      * existing shared root rather than a newly created string.
      */
 544 VALUE
 545 rb_str_new_frozen(VALUE orig)
 546 {
 547     VALUE klass, str;
 548
 549     if (OBJ_FROZEN(orig)) return orig;
 550     klass = rb_obj_class(orig);
 551     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
 552         long ofs;
              /* how much longer the shared root is than orig */
 553         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
 554         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
 555             (!OBJ_TAINTED(str) && OBJ_TAINTED(orig))) {
 556             str = str_new3(klass, str);
                  /* keep only the last RSTRING_LEN(orig) bytes of the root.
                   * NOTE(review): this assumes orig is a suffix of the root
                   * and that str_new3() returned a heap (non-embedded)
                   * string -- confirm for short roots. */
 557             RSTRING(str)->as.heap.ptr += ofs;
 558             RSTRING(str)->as.heap.len -= ofs;
 559         }
 560         rb_enc_cr_str_exact_copy(str, orig);
 561         OBJ_INFECT(str, orig);
 562     }
 563     else if (STR_EMBED_P(orig)) {
 564         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
 565         rb_enc_cr_str_exact_copy(str, orig);
 566         OBJ_INFECT(str, orig);
 567     }
 568     else if (STR_ASSOC_P(orig)) {
              /* str_new4() overwrites orig's aux field, so save the
               * associated value first and hand it to the new string */
 569         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
 570         FL_UNSET(orig, STR_ASSOC);
 571         str = str_new4(klass, orig);
 572         FL_SET(str, STR_ASSOC);
 573         RSTRING(str)->as.heap.aux.shared = assoc;
 574     }
 575     else {
 576         str = str_new4(klass, orig);
 577     }
 578     OBJ_FREEZE(str);
 579     return str;
 580 }
 581
     /* rb_str_new4 is the older name of rb_str_new_frozen. */
 582 alias_func(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
 583 #define rb_str_new4 rb_str_new_frozen
 584
 585 VALUE
 586 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
 587 {
 588     return str_new(rb_obj_class(obj), ptr, len);
 589 }
 590
 591 alias_func(rb_str_new5(VALUE obj, const char *ptr, long len),
 592            rb_str_new_with_class, (obj, ptr, len))
 593 #define rb_str_new5 rb_str_new_with_class
 594
 595 #define STR_BUF_MIN_SIZE 128
 596
 597 VALUE
 598 rb_str_buf_new(long capa)
 599 {
 600     VALUE str = str_alloc(rb_cString);
 601
 602     if (capa < STR_BUF_MIN_SIZE) {
 603         capa = STR_BUF_MIN_SIZE;
 604     }
 605     FL_SET(str, STR_NOEMBED);
 606     RSTRING(str)->as.heap.aux.capa = capa;
 607     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
 608     RSTRING(str)->as.heap.ptr[0] = '\0';
 609
 610     return str;
 611 }
 612
 613 VALUE
 614 rb_str_buf_new_cstr(const char *ptr)
 615 {
 616     VALUE str;
 617     long len = strlen(ptr);
 618
 619     str = rb_str_buf_new(len);
 620     rb_str_buf_cat(str, ptr, len);
 621
 622     return str;
 623 }
 624
 625 alias_func(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
 626 #define rb_str_buf_new2 rb_str_buf_new_cstr
 627
 628 VALUE
 629 rb_str_tmp_new(long len)
 630 {
 631     return str_new(0, 0, len);
 632 }
 633
 634 void
 635 rb_str_free(VALUE str)
 636 {
 637     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
 638         xfree(RSTRING(str)->as.heap.ptr);
 639     }
 640 }
 641
 642 VALUE
 643 rb_str_to_str(VALUE str)
 644 {
 645     return rb_convert_type(str, T_STRING, "String", "to_str");
 646 }
 647
     /*
      * Replaces the contents of str with those of str2.
      *
      * A str2 short enough to be embedded is copied and left untouched.  For
      * a longer str2 the buffer is MOVED: str takes over str2's pointer,
      * length and aux field (capacity, or shared/assoc reference together
      * with the corresponding flags), and str2 is left empty with a null
      * heap pointer.
      *
      * str's previous private heap buffer is freed.  Taint, encoding and
      * coderange of str2 are carried over.  Nothing happens when both
      * arguments are the same object.  rb_str_modify(str) is called first,
      * so the usual frozen/locked/safe-level checks apply to str.
      */
 648 void
 649 rb_str_shared_replace(VALUE str, VALUE str2)
 650 {
 651     rb_encoding *enc;
 652     int cr;
 653     if (str == str2) return;
          /* sample encoding and coderange before anything is modified */
 654     enc = STR_ENC_GET(str2);
 655     cr = ENC_CODERANGE(str2);
 656     rb_str_modify(str);
 657     OBJ_INFECT(str, str2);
 658     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
 659         xfree(RSTRING_PTR(str));
 660     }
 661     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
              /* small enough: copy into the embedded area (contents + NUL) */
 662         STR_SET_EMBED(str);
 663         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
 664         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
 665         rb_enc_associate(str, enc);
 666         ENC_CODERANGE_SET(str, cr);
 667         return;
 668     }
 669     STR_SET_NOEMBED(str);
 670     STR_UNSET_NOCAPA(str);
 671     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
 672     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
 673     if (STR_NOCAPA_P(str2)) {
              /* str2's aux holds a shared/assoc reference: inherit it
               * together with the matching flag bits */
 674         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
 675         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
 676     }
 677     else {
 678         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
 679     }
 680     RSTRING(str2)->as.heap.ptr = 0;     /* abandon str2 */
 681     RSTRING(str2)->as.heap.len = 0;
 682     RSTRING(str2)->as.heap.aux.capa = 0;
 683     STR_UNSET_NOCAPA(str2);
 684     rb_enc_associate(str, enc);
 685     ENC_CODERANGE_SET(str, cr);
 686 }
 687
 688 static ID id_to_s;
 689
 690 VALUE
 691 rb_obj_as_string(VALUE obj)
 692 {
 693     VALUE str;
 694
 695     if (TYPE(obj) == T_STRING) {
 696         return obj;
 697     }
 698     str = rb_funcall(obj, id_to_s, 0);
 699     if (TYPE(str) != T_STRING)
 700         return rb_any_to_s(obj);
 701     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
 702     return str;
 703 }
 704
 705 static VALUE rb_str_replace(VALUE, VALUE);
 706
 707 VALUE
 708 rb_str_dup(VALUE str)
 709 {
 710     VALUE dup = str_alloc(rb_obj_class(str));
 711     rb_str_replace(dup, str);
 712     return dup;
 713 }
 714
 715
 716 /*
 717  *  call-seq:
 718  *     String.new(str="")   => new_str
 719  *
 720  *  Returns a new string object containing a copy of <i>str</i>.
 721  */
 722
 723 static VALUE
 724 rb_str_init(int argc, VALUE *argv, VALUE str)
 725 {
 726     VALUE orig;
 727
 728     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
 729         rb_str_replace(str, orig);
 730     return str;
 731 }
 732
 733 long
 734 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
 735 {
 736     long c;
 737     const char *q;
 738
 739     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 740         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
 741     }
 742     else if (rb_enc_asciicompat(enc)) {
 743         c = 0;
 744         while (p < e) {
 745             if (ISASCII(*p)) {
 746                 q = search_nonascii(p, e);
 747                 if (!q)
 748                     return c + (e - p);
 749                 c += q - p;
 750                 p = q;
 751             }
 752             p += rb_enc_mbclen(p, e, enc);
 753             c++;
 754         }
 755         return c;
 756     }
 757
 758     for (c=0; p<e; c++) {
 759         p += rb_enc_mbclen(p, e, enc);
 760     }
 761     return c;
 762 }
 763
 764 long
 765 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
 766 {
 767     long c;
 768     const char *q;
 769     int ret;
 770
 771     *cr = 0;
 772     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 773         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
 774     }
 775     else if (rb_enc_asciicompat(enc)) {
 776         c = 0;
 777         while (p < e) {
 778             if (ISASCII(*p)) {
 779                 q = search_nonascii(p, e);
 780                 if (!q) {
 781                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
 782                     return c + (e - p);
 783                 }
 784                 c += q - p;
 785                 p = q;
 786             }
 787             ret = rb_enc_precise_mbclen(p, e, enc);
 788             if (MBCLEN_CHARFOUND_P(ret)) {
 789                 *cr |= ENC_CODERANGE_VALID;
 790                 p += MBCLEN_CHARFOUND_LEN(ret);
 791             }
 792             else {
 793                 *cr = ENC_CODERANGE_BROKEN;
 794                 p++;
 795             }
 796             c++;
 797         }
 798         if (!*cr) *cr = ENC_CODERANGE_7BIT;
 799         return c;
 800     }
 801
 802     for (c=0; p<e; c++) {
 803         ret = rb_enc_precise_mbclen(p, e, enc);
 804         if (MBCLEN_CHARFOUND_P(ret)) {
 805             *cr |= ENC_CODERANGE_VALID;
 806             p += MBCLEN_CHARFOUND_LEN(ret);
 807         }
 808         else {
 809             *cr = ENC_CODERANGE_BROKEN;
 810             p++;
 811         }
 812     }
 813     if (!*cr) *cr = ENC_CODERANGE_7BIT;
 814     return c;
 815 }
 816
     /* Only available when search_nonascii() was able to define
      * NONASCII_MASK, i.e. when SIZEOF_VALUE is 4 or 8. */
 817 #ifdef NONASCII_MASK
     /* True for every byte that is not of the form 10xxxxxx. */
 818 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
     /*
      * Returns how many bytes of the word *s satisfy is_utf8_lead_byte(),
      * without looking at the bytes one by one.
      *
      * For each byte, bit 6 of (d | ~(d>>1)) is clear only when bit 7 is set
      * and bit 6 is clear.  That bit is moved to bit 0 of its byte, all other
      * bits are masked off, and the per-byte 0/1 values are then added up
      * into the lowest byte by the shift-and-add steps.
      */
 819 static inline VALUE
 820 count_utf8_lead_bytes_with_word(const VALUE *s)
 821 {
 822     VALUE d = *s;
 823     d |= ~(d>>1);
 824     d >>= 6;
 825     d &= NONASCII_MASK >> 7;
 826     d += (d>>8);
 827     d += (d>>16);
 828 #if SIZEOF_VALUE == 8
 829     d += (d>>32);
 830 #endif
          /* the total is at most sizeof(VALUE), so four bits are enough */
 831     return (d&0xF);
 832 }
 833 #endif
 834
 835 static long
 836 str_strlen(VALUE str, rb_encoding *enc)
 837 {
 838     const char *p, *e;
 839     int n, cr;
 840
 841     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
 842     if (!enc) enc = STR_ENC_GET(str);
 843     p = RSTRING_PTR(str);
 844     e = RSTRING_END(str);
 845 #ifdef NONASCII_MASK
 846     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
 847         enc == rb_utf8_encoding()) {
 848         VALUE len = 0;
 849         if (sizeof(VALUE) * 2 < e - p) {
 850             const VALUE *s, *t;
 851             const VALUE lowbits = sizeof(VALUE) - 1;
 852             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
 853             t = (const VALUE*)(~lowbits & (VALUE)e);
 854             while (p < (const char *)s) {
 855                 if (is_utf8_lead_byte(*p)) len++;
 856                 p++;
 857             }
 858             while (s < t) {
 859                 len += count_utf8_lead_bytes_with_word(s);
 860                 s++;
 861             }
 862             p = (const char *)s;
 863         }
 864         while (p < e) {
 865             if (is_utf8_lead_byte(*p)) len++;
 866             p++;
 867         }
 868         return (long)len;
 869     }
 870 #endif
 871     n = rb_enc_strlen_cr(p, e, enc, &cr);
 872     if (cr) {
 873         ENC_CODERANGE_SET(str, cr);
 874     }
 875     return n;
 876 }
 877
 878 /*
 879  *  call-seq:
 880  *     str.length   => integer
 881  *     str.size     => integer
 882  *
 883  *  Returns the character length of <i>str</i>.
 884  */
 885
 886 VALUE
 887 rb_str_length(VALUE str)
 888 {
 889     int len;
 890
 891     len = str_strlen(str, STR_ENC_GET(str));
 892     return INT2NUM(len);
 893 }
 894
 895 /*
 896  *  call-seq:
 897  *     str.bytesize  => integer
 898  *
 899  *  Returns the length of <i>str</i> in bytes.
 900  */
 901
 902 static VALUE
 903 rb_str_bytesize(VALUE str)
 904 {
 905     return INT2NUM(RSTRING_LEN(str));
 906 }
 907
 908 /*
 909  *  call-seq:
 910  *     str.empty?   => true or false
 911  *
 912  *  Returns <code>true</code> if <i>str</i> has a length of zero.
 913  *
 914  *     "hello".empty?   #=> false
 915  *     "".empty?        #=> true
 916  */
 917
 918 static VALUE
 919 rb_str_empty(VALUE str)
 920 {
 921     if (RSTRING_LEN(str) == 0)
 922         return Qtrue;
 923     return Qfalse;
 924 }
 925
 926 /*
 927  *  call-seq:
 928  *     str + other_str   => new_str
 929  *
 930  *  Concatenation---Returns a new <code>String</code> containing
 931  *  <i>other_str</i> concatenated to <i>str</i>.
 932  *
 933  *     "Hello from " + self.to_s   #=> "Hello from main"
 934  */
 935
 936 VALUE
 937 rb_str_plus(VALUE str1, VALUE str2)
 938 {
 939     VALUE str3;
 940     rb_encoding *enc;
 941
 942     StringValue(str2);
 943     enc = rb_enc_check(str1, str2);
 944     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
 945     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
 946     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
 947            RSTRING_PTR(str2), RSTRING_LEN(str2));
 948     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
 949
 950     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
 951         OBJ_TAINT(str3);
 952     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
 953                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
 954     return str3;
 955 }
 956
 957 /*
 958  *  call-seq:
 959  *     str * integer   => new_str
 960  *
 961  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
 962  *  the receiver.
 963  *
 964  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
 965  */
 966
 967 VALUE
 968 rb_str_times(VALUE str, VALUE times)
 969 {
 970     VALUE str2;
 971     long n, len;
 972
 973     len = NUM2LONG(times);
 974     if (len < 0) {
 975         rb_raise(rb_eArgError, "negative argument");
 976     }
 977     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
 978         rb_raise(rb_eArgError, "argument too big");
 979     }
 980
 981     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
 982     if (len) {
 983         n = RSTRING_LEN(str);
 984         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), n);
 985         while (n <= len/2) {
 986             memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), n);
 987             n *= 2;
 988         }
 989         memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), len-n);
 990     }
 991     RSTRING_PTR(str2)[RSTRING_LEN(str2)] = '\0';
 992     OBJ_INFECT(str2, str);
 993     rb_enc_cr_str_copy_for_substr(str2, str);
 994
 995     return str2;
 996 }
 997
 998 /*
 999  *  call-seq:
1000  *     str % arg   => new_str
1001  *
1002  *  Format---Uses <i>str</i> as a format specification, and returns the result
1003  *  of applying it to <i>arg</i>. If the format specification contains more than
1004  *  one substitution, then <i>arg</i> must be an <code>Array</code> containing
1005  *  the values to be substituted. See <code>Kernel::sprintf</code> for details
1006  *  of the format string.
1007  *
1008  *     "%05d" % 123                              #=> "00123"
1009  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
1010  */
1011
1012 static VALUE
1013 rb_str_format_m(VALUE str, VALUE arg)
1014 {
1015     volatile VALUE tmp = rb_check_array_type(arg);
1016
1017     if (!NIL_P(tmp)) {
1018         return rb_str_format(RARRAY_LEN(tmp), RARRAY_PTR(tmp), str);
1019     }
1020     return rb_str_format(1, &arg, str);
1021 }
1022
1023 static inline void
1024 str_modifiable(VALUE str)
1025 {
1026     if (FL_TEST(str, STR_TMPLOCK)) {
1027         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1028     }
1029     if (OBJ_FROZEN(str)) rb_error_frozen("string");
1030     if (!OBJ_TAINTED(str) && rb_safe_level() >= 4)
1031         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
1032 }
1033
1034 static inline int
1035 str_independent(VALUE str)
1036 {
1037     str_modifiable(str);
1038     if (!STR_SHARED_P(str)) return 1;
1039     if (STR_EMBED_P(str)) return 1;
1040     return 0;
1041 }
1042
1043 static void
1044 str_make_independent(VALUE str)
1045 {
1046     char *ptr;
1047     long len = RSTRING_LEN(str);
1048
1049     ptr = ALLOC_N(char, len+1);
1050     if (RSTRING_PTR(str)) {
1051         memcpy(ptr, RSTRING_PTR(str), len);
1052     }
1053     STR_SET_NOEMBED(str);
1054     ptr[len] = 0;
1055     RSTRING(str)->as.heap.ptr = ptr;
1056     RSTRING(str)->as.heap.len = len;
1057     RSTRING(str)->as.heap.aux.capa = len;
1058     STR_UNSET_NOCAPA(str);
1059 }
1060
1061 void
1062 rb_str_modify(VALUE str)
1063 {
1064     if (!str_independent(str))
1065         str_make_independent(str);
1066     ENC_CODERANGE_CLEAR(str);
1067 }
1068
     /*
      * Attaches `add` to str as its associated value.
      *
      * If str already has an association, `add` is appended to it with
      * rb_ary_concat().  Otherwise str is first brought into a form whose aux
      * field is free to hold the reference (un-shared, moved out of the
      * embedded representation, or shrunk so that capacity == length), then
      * flagged STR_ASSOC and pointed at `add`.
      *
      * Raises the frozen error when str is frozen.
      */
1069 void
1070 rb_str_associate(VALUE str, VALUE add)
1071 {
1072     /* sanity check */
1073     if (OBJ_FROZEN(str)) rb_error_frozen("string");
1074     if (STR_ASSOC_P(str)) {
1075         /* already associated */
1076         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
1077     }
1078     else {
1079         if (STR_SHARED_P(str)) {
                  /* remember the shared root before str is un-shared */
1080             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
1081             str_make_independent(str);
                  /* if the root carried an association, extend that one and
                   * attach it to str instead of `add` alone */
1082             if (STR_ASSOC_P(assoc)) {
1083                 assoc = RSTRING(assoc)->as.heap.aux.shared;
1084                 rb_ary_concat(assoc, add);
1085                 add = assoc;
1086             }
1087         }
1088         else if (STR_EMBED_P(str)) {
1089             str_make_independent(str);
1090         }
1091         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
                  /* aux.capa is about to be overwritten, so make the buffer
                   * exactly as large as the contents */
1092             RESIZE_CAPA(str, RSTRING_LEN(str));
1093         }
1094         FL_SET(str, STR_ASSOC);
              /* clear the class of the associated object */
1095         RBASIC(add)->klass = 0;
1096         RSTRING(str)->as.heap.aux.shared = add;
1097     }
1098 }
1099
1100 VALUE
1101 rb_str_associated(VALUE str)
1102 {
1103     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1104     if (STR_ASSOC_P(str)) {
1105         return RSTRING(str)->as.heap.aux.shared;
1106     }
1107     return Qfalse;
1108 }
1109
1110 VALUE
1111 rb_string_value(volatile VALUE *ptr)
1112 {
1113     VALUE s = *ptr;
1114     if (TYPE(s) != T_STRING) {
1115         s = rb_str_to_str(s);
1116         *ptr = s;
1117     }
1118     return s;
1119 }
1120
1121 char *
1122 rb_string_value_ptr(volatile VALUE *ptr)
1123 {
1124     return RSTRING_PTR(rb_string_value(ptr));
1125 }
1126
1127 char *
1128 rb_string_value_cstr(volatile VALUE *ptr)
1129 {
1130     VALUE str = rb_string_value(ptr);
1131     char *s = RSTRING_PTR(str);
1132
1133     if (!s || RSTRING_LEN(str) != strlen(s)) {
1134         rb_raise(rb_eArgError, "string contains null byte");
1135     }
1136     return s;
1137 }
1138
1139 VALUE
1140 rb_check_string_type(VALUE str)
1141 {
1142     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1143     return str;
1144 }
1145
1146 /*
1147  *  call-seq:
1148  *     String.try_convert(obj) -> string or nil
1149  *
1150  *  Try to convert <i>obj</i> into a String, using to_str method.
1151  *  Returns converted regexp or nil if <i>obj</i> cannot be converted
1152  *  for any reason.
1153  *
1154  *     String.try_convert("str")     # => str
1155  *     String.try_convert(/re/)      # => nil
1156  */
1157 static VALUE
1158 rb_str_s_try_convert(VALUE dummy, VALUE str)
1159 {
1160     return rb_check_string_type(str);
1161 }
1162
1163 char*
1164 rb_enc_nth(const char *p, const char *e, int nth, rb_encoding *enc)
1165 {
1166     if (rb_enc_mbmaxlen(enc) == 1) {
1167         p += nth;
1168     }
1169     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1170         p += nth * rb_enc_mbmaxlen(enc);
1171     }
1172     else if (rb_enc_asciicompat(enc)) {
1173         const char *p2, *e2;
1174         int n;
1175
1176         while (p < e && 0 < nth) {
1177             e2 = p + nth;
1178             if (e < e2)
1179                 return (char *)e;
1180             if (ISASCII(*p)) {
1181                 p2 = search_nonascii(p, e2);
1182                 if (!p2)
1183                     return (char *)e2;
1184                 nth -= p2 - p;
1185                 p = p2;
1186             }
1187             n = rb_enc_mbclen(p, e, enc);
1188             p += n;
1189             nth--;
1190         }
1191         if (nth != 0)
1192             return (char *)e;
1193         return (char *)p;
1194     }
1195     else {
1196         while (p<e && nth--) {
1197             p += rb_enc_mbclen(p, e, enc);
1198         }
1199     }
1200     if (p > e) p = e;
1201     return (char*)p;
1202 }
1203
1204 static char*
1205 str_nth(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1206 {
1207     if (singlebyte)
1208         p += nth;
1209     else {
1210         p = rb_enc_nth(p, e, nth, enc);
1211     }
1212     if (!p) return 0;
1213     if (p > e) p = e;
1214     return (char *)p;
1215 }
1216
1217 /* char offset to byte offset */
1218 static int
1219 str_offset(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1220 {
1221     const char *pp = str_nth(p, e, nth, enc, singlebyte);
1222     if (!pp) return e - p;
1223     return pp - p;
1224 }
1225
1226 #ifdef NONASCII_MASK
1227 static char *
1228 str_utf8_nth(const char *p, const char *e, int nth)
1229 {
1230     if (sizeof(VALUE) * 2 < nth) {
1231         const VALUE *s, *t;
1232         const VALUE lowbits = sizeof(VALUE) - 1;
1233         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1234         t = (const VALUE*)(~lowbits & (VALUE)e);
1235         while (p < (const char *)s) {
1236             if (is_utf8_lead_byte(*p)) nth--;
1237             p++;
1238         }
1239         do {
1240             nth -= count_utf8_lead_bytes_with_word(s);
1241             s++;
1242         } while (s < t && sizeof(VALUE) <= nth);
1243         p = (char *)s;
1244     }
1245     while (p < e) {
1246         if (is_utf8_lead_byte(*p)) {
1247             if (nth == 0) break;
1248             nth--;
1249         }
1250         p++;
1251     }
1252     return (char *)p;
1253 }
1254
/*
 * Byte length of the first nth characters of the UTF-8 range [p, e),
 * or e - p when str_utf8_nth returns a null pointer.
 */
static int
str_utf8_offset(const char *p, const char *e, int nth)
{
    const char *pp = str_utf8_nth(p, e, nth);

    return (pp ? pp : e) - p;
}
1262 #endif
1263
1264 /* byte offset to char offset */
1265 long
1266 rb_str_sublen(VALUE str, long pos)
1267 {
1268     if (single_byte_optimizable(str) || pos < 0)
1269         return pos;
1270     else {
1271         char *p = RSTRING_PTR(str);
1272         return rb_enc_strlen(p, p + pos, STR_ENC_GET(str));
1273     }
1274 }
1275
1276 VALUE
1277 rb_str_subseq(VALUE str, long beg, long len)
1278 {
1279     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1280
1281     rb_enc_cr_str_copy_for_substr(str2, str);
1282     OBJ_INFECT(str2, str);
1283
1284     return str2;
1285 }
1286
1287 VALUE
1288 rb_str_substr(VALUE str, long beg, long len)
1289 {
1290     rb_encoding *enc = STR_ENC_GET(str);
1291     VALUE str2;
1292     char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
1293     int singlebyte;
1294
1295     if (len < 0) return Qnil;
1296     if (!RSTRING_LEN(str)) {
1297         len = 0;
1298     }
1299     if (beg < 0) {
1300         if (len > -beg) len = -beg;
1301         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
1302             beg = -beg;
1303             while (beg-- > len && (e = rb_enc_prev_char(s, e, enc)) != 0);
1304             p = e;
1305             if (!p) return Qnil;
1306             while (len-- > 0 && (p = rb_enc_prev_char(s, p, enc)) != 0);
1307             if (!p) return Qnil;
1308             len = e - p;
1309             goto sub;
1310         }
1311         else {
1312             beg += str_strlen(str, enc);
1313             if (beg < 0) return Qnil;
1314         }
1315     }
1316     else if (beg > 0 && beg > str_strlen(str, enc)) {
1317         return Qnil;
1318     }
1319     singlebyte = single_byte_optimizable(str);
1320     if (len == 0) {
1321         p = 0;
1322     }
1323 #ifdef NONASCII_MASK
1324     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1325         enc == rb_utf8_encoding()) {
1326         p = str_utf8_nth(s, e, beg);
1327         len = str_utf8_offset(p, e, len);
1328     }
1329 #endif
1330     else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) {
1331         len = 0;
1332     }
1333     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1334         if (len * rb_enc_mbmaxlen(enc) > e - p)
1335             len = e - p;
1336         else
1337             len *= rb_enc_mbmaxlen(enc);
1338     }
1339     else {
1340         len = str_offset(p, e, len, enc, singlebyte);
1341     }
1342   sub:
1343     if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
1344         str2 = rb_str_new4(str);
1345         str2 = str_new3(rb_obj_class(str2), str2);
1346         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1347         RSTRING(str2)->as.heap.len = len;
1348     }
1349     else {
1350         str2 = rb_str_new5(str, p, len);
1351         rb_enc_cr_str_copy_for_substr(str2, str);
1352         OBJ_INFECT(str2, str);
1353     }
1354
1355     return str2;
1356 }
1357
1358 VALUE
1359 rb_str_freeze(VALUE str)
1360 {
1361     if (STR_ASSOC_P(str)) {
1362         VALUE ary = RSTRING(str)->as.heap.aux.shared;
1363         OBJ_FREEZE(ary);
1364     }
1365     return rb_obj_freeze(str);
1366 }
1367
1368 alias_func(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
1369 #define rb_str_dup_frozen rb_str_new_frozen
1370
1371 VALUE
1372 rb_str_locktmp(VALUE str)
1373 {
1374     if (FL_TEST(str, STR_TMPLOCK)) {
1375         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1376     }
1377     FL_SET(str, STR_TMPLOCK);
1378     return str;
1379 }
1380
1381 VALUE
1382 rb_str_unlocktmp(VALUE str)
1383 {
1384     if (!FL_TEST(str, STR_TMPLOCK)) {
1385         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1386     }
1387     FL_UNSET(str, STR_TMPLOCK);
1388     return str;
1389 }
1390
1391 void
1392 rb_str_set_len(VALUE str, long len)
1393 {
1394     STR_SET_LEN(str, len);
1395     RSTRING_PTR(str)[len] = '\0';
1396 }
1397
1398 VALUE
1399 rb_str_resize(VALUE str, long len)
1400 {
1401     long slen;
1402
1403     if (len < 0) {
1404         rb_raise(rb_eArgError, "negative string size (or size too big)");
1405     }
1406
1407     rb_str_modify(str);
1408     slen = RSTRING_LEN(str);
1409     if (len != slen) {
1410         if (STR_EMBED_P(str)) {
1411             char *ptr;
1412             if (len <= RSTRING_EMBED_LEN_MAX) {
1413                 STR_SET_EMBED_LEN(str, len);
1414                 RSTRING(str)->as.ary[len] = '\0';
1415                 return str;
1416             }
1417             ptr = ALLOC_N(char,len+1);
1418             MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
1419             RSTRING(str)->as.heap.ptr = ptr;
1420             STR_SET_NOEMBED(str);
1421         }
1422         else if (len <= RSTRING_EMBED_LEN_MAX) {
1423             char *ptr = RSTRING(str)->as.heap.ptr;
1424             STR_SET_EMBED(str);
1425             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, len);
1426             RSTRING(str)->as.ary[len] = '\0';
1427             STR_SET_EMBED_LEN(str, len);
1428             xfree(ptr);
1429             return str;
1430         }
1431         else if (slen < len || slen - len > 1024) {
1432             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1433         }
1434         if (!STR_NOCAPA_P(str)) {
1435             RSTRING(str)->as.heap.aux.capa = len;
1436         }
1437         RSTRING(str)->as.heap.len = len;
1438         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
1439     }
1440     return str;
1441 }
1442
1443 static VALUE
1444 str_buf_cat(VALUE str, const char *ptr, long len)
1445 {
1446     long capa, total, off = -1;
1447
1448     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1449         off = ptr - RSTRING_PTR(str);
1450     }
1451     rb_str_modify(str);
1452     if (len == 0) return 0;
1453     if (STR_ASSOC_P(str)) {
1454         FL_UNSET(str, STR_ASSOC);
1455         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1456     }
1457     else if (STR_EMBED_P(str)) {
1458         capa = RSTRING_EMBED_LEN_MAX;
1459     }
1460     else {
1461         capa = RSTRING(str)->as.heap.aux.capa;
1462     }
1463     if (RSTRING_LEN(str) >= LONG_MAX - len) {
1464         rb_raise(rb_eArgError, "string sizes too big");
1465     }
1466     total = RSTRING_LEN(str)+len;
1467     if (capa <= total) {
1468         while (total > capa) {
1469             if (capa + 1 >= LONG_MAX / 2) {
1470                 capa = (total + 4095) / 4096;
1471                 break;
1472             }
1473             capa = (capa + 1) * 2;
1474         }
1475         RESIZE_CAPA(str, capa);
1476     }
1477     if (off != -1) {
1478         ptr = RSTRING_PTR(str) + off;
1479     }
1480     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1481     STR_SET_LEN(str, total);
1482     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1483
1484     return str;
1485 }
1486
1487 VALUE
1488 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1489 {
1490     if (len == 0) return str;
1491     if (len < 0) {
1492         rb_raise(rb_eArgError, "negative string size (or size too big)");
1493     }
1494     return str_buf_cat(str, ptr, len);
1495 }
1496
1497 VALUE
1498 rb_str_buf_cat2(VALUE str, const char *ptr)
1499 {
1500     return rb_str_buf_cat(str, ptr, strlen(ptr));
1501 }
1502
1503 VALUE
1504 rb_str_cat(VALUE str, const char *ptr, long len)
1505 {
1506     if (len < 0) {
1507         rb_raise(rb_eArgError, "negative string size (or size too big)");
1508     }
1509     if (STR_ASSOC_P(str)) {
1510         rb_str_modify(str);
1511         if (STR_EMBED_P(str)) str_make_independent(str);
1512         REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
1513         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
1514         RSTRING(str)->as.heap.len += len;
1515         RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0'; /* sentinel */
1516         return str;
1517     }
1518
1519     return rb_str_buf_cat(str, ptr, len);
1520 }
1521
1522 VALUE
1523 rb_str_cat2(VALUE str, const char *ptr)
1524 {
1525     return rb_str_cat(str, ptr, strlen(ptr));
1526 }
1527
1528 static VALUE
1529 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
1530     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
1531 {
1532     int str_encindex = ENCODING_GET(str);
1533     int res_encindex;
1534     int str_cr, res_cr;
1535     int str_a8 = ENCODING_IS_ASCII8BIT(str);
1536     int ptr_a8 = ptr_encindex == 0;
1537
1538     str_cr = ENC_CODERANGE(str);
1539
1540     if (str_encindex == ptr_encindex) {
1541         if (str_cr == ENC_CODERANGE_UNKNOWN ||
1542             (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
1543             ptr_cr = ENC_CODERANGE_UNKNOWN;
1544         }
1545         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1546             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
1547         }
1548     }
1549     else {
1550         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
1551         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
1552         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
1553             if (len == 0)
1554                 return str;
1555             if (RSTRING_LEN(str) == 0) {
1556                 rb_str_buf_cat(str, ptr, len);
1557                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
1558                 return str;
1559             }
1560             goto incompatible;
1561         }
1562         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1563             ptr_cr = coderange_scan(ptr, len, ptr_enc);
1564         }
1565         if (str_cr == ENC_CODERANGE_UNKNOWN) {
1566             if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
1567                 str_cr = rb_enc_str_coderange(str);
1568             }
1569         }
1570     }
1571     if (ptr_cr_ret)
1572         *ptr_cr_ret = ptr_cr;
1573
1574     if (str_encindex != ptr_encindex &&
1575         str_cr != ENC_CODERANGE_7BIT &&
1576         ptr_cr != ENC_CODERANGE_7BIT) {
1577       incompatible:
1578         rb_raise(rb_eArgError, "append incompatible encoding strings: %s and %s",
1579             rb_enc_name(rb_enc_from_index(str_encindex)),
1580             rb_enc_name(rb_enc_from_index(ptr_encindex)));
1581     }
1582
1583     if (str_cr == ENC_CODERANGE_UNKNOWN) {
1584         res_encindex = str_encindex;
1585         res_cr = ENC_CODERANGE_UNKNOWN;
1586     }
1587     else if (str_cr == ENC_CODERANGE_7BIT) {
1588         if (ptr_cr == ENC_CODERANGE_7BIT) {
1589             res_encindex = !str_a8 ? str_encindex : ptr_encindex;
1590             res_cr = ENC_CODERANGE_7BIT;
1591         }
1592         else {
1593             res_encindex = ptr_encindex;
1594             res_cr = ptr_cr;
1595         }
1596     }
1597     else if (str_cr == ENC_CODERANGE_VALID) {
1598         res_encindex = str_encindex;
1599         res_cr = str_cr;
1600     }
1601     else { /* str_cr == ENC_CODERANGE_BROKEN */
1602         res_encindex = str_encindex;
1603         res_cr = str_cr;
1604         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
1605     }
1606
1607     if (len < 0) {
1608         rb_raise(rb_eArgError, "negative string size (or size too big)");
1609     }
1610     str_buf_cat(str, ptr, len);
1611     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
1612     return str;
1613 }
1614
1615 VALUE
1616 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
1617 {
1618     return rb_enc_cr_str_buf_cat(str, ptr, len,
1619         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
1620 }
1621
1622 VALUE
1623 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
1624 {
1625     /* ptr must reference NUL terminated ASCII string. */
1626     int encindex = ENCODING_GET(str);
1627     rb_encoding *enc = rb_enc_from_index(encindex);
1628     if (rb_enc_asciicompat(enc)) {
1629         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
1630             encindex, ENC_CODERANGE_7BIT, 0);
1631     }
1632     else {
1633         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
1634         while (*ptr) {
1635             int c = (unsigned char)*ptr;
1636             int len = rb_enc_codelen(c, enc);
1637             rb_enc_mbcput(c, buf, enc);
1638             rb_enc_cr_str_buf_cat(str, buf, len,
1639                 encindex, ENC_CODERANGE_VALID, 0);
1640             ptr++;
1641         }
1642         return str;
1643     }
1644 }
1645
1646 VALUE
1647 rb_str_buf_append(VALUE str, VALUE str2)
1648 {
1649     int str2_cr;
1650
1651     str2_cr = ENC_CODERANGE(str2);
1652
1653     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
1654         ENCODING_GET(str2), str2_cr, &str2_cr);
1655
1656     OBJ_INFECT(str, str2);
1657     ENC_CODERANGE_SET(str2, str2_cr);
1658
1659     return str;
1660 }
1661
1662 VALUE
1663 rb_str_append(VALUE str, VALUE str2)
1664 {
1665     rb_encoding *enc;
1666     int cr, cr2;
1667
1668     StringValue(str2);
1669     if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
1670         long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
1671         enc = rb_enc_check(str, str2);
1672         cr = ENC_CODERANGE(str);
1673         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
1674         rb_str_modify(str);
1675         REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1676         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
1677                RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
1678         RSTRING(str)->as.heap.len = len;
1679         rb_enc_associate(str, enc);
1680         ENC_CODERANGE_SET(str, cr);
1681         OBJ_INFECT(str, str2);
1682         return str;
1683     }
1684     return rb_str_buf_append(str, str2);
1685 }
1686
1687
1688 /*
1689  *  call-seq:
1690  *     str << fixnum        => str
1691  *     str.concat(fixnum)   => str
1692  *     str << obj           => str
1693  *     str.concat(obj)      => str
1694  *
1695  *  Append---Concatenates the given object to <i>str</i>. If the object is a
1696  *  <code>Fixnum</code>, it is considered as a codepoint, and is converted
1697  *  to a character before concatenation.
1698  *
1699  *     a = "hello "
1700  *     a << "world"   #=> "hello world"
1701  *     a.concat(33)   #=> "hello world!"
1702  */
1703
1704 VALUE
1705 rb_str_concat(VALUE str1, VALUE str2)
1706 {
1707     if (FIXNUM_P(str2)) {
1708         rb_encoding *enc = STR_ENC_GET(str1);
1709         int c = FIX2INT(str2);
1710         int pos = RSTRING_LEN(str1);
1711         int len = rb_enc_codelen(c, enc);
1712         int cr = ENC_CODERANGE(str1);
1713
1714         rb_str_resize(str1, pos+len);
1715         rb_enc_mbcput(c, RSTRING_PTR(str1)+pos, enc);
1716         ENC_CODERANGE_SET(str1, cr);
1717         return str1;
1718     }
1719     return rb_str_append(str1, str2);
1720 }
1721
1722 #if defined __i386__ || defined _M_IX86
1723 #define UNALIGNED_WORD_ACCESS 1
1724 #endif
1725 #ifndef UNALIGNED_WORD_ACCESS
1726 #define UNALIGNED_WORD_ACCESS 0
1727 #endif
1728
1729 /* MurmurHash described in http://murmurhash.googlepages.com/ */
1730 static unsigned int
1731 hash(const unsigned char * data, int len, unsigned int h)
1732 {
1733     const unsigned int m = 0x7fd652ad;
1734     const int r = 16;
1735
1736     h += 0xdeadbeef;
1737
1738     if (len >= 4) {
1739 #if !UNALIGNED_WORD_ACCESS
1740         int align = (VALUE)data & 3;
1741         if (align) {
1742             uint32_t t = 0, d = 0;
1743             int sl, sr, pack;
1744
1745             switch (align) {
1746 #ifdef WORDS_BIGENDIAN
1747               case 1: t |= data[2];
1748               case 2: t |= data[1] << 8;
1749               case 3: t |= data[0] << 16;
1750 #else
1751               case 1: t |= data[2] << 16;
1752               case 2: t |= data[1] << 8;
1753               case 3: t |= data[0];
1754 #endif
1755             }
1756
1757 #ifdef WORDS_BIGENDIAN
1758             t >>= (8 * align) - 8;
1759 #else
1760             t <<= (8 * align);
1761 #endif
1762
1763             data += 4-align;
1764             len -= 4-align;
1765
1766             sl = 8 * (4-align);
1767             sr = 8 * align;
1768
1769             while (len >= 4) {
1770                 d = *(uint32_t *)data;
1771 #ifdef WORDS_BIGENDIAN
1772                 t = (t << sr) | (d >> sl);
1773 #else
1774                 t = (t >> sr) | (d << sl);
1775 #endif
1776                 h += t;
1777                 h *= m;
1778                 h ^= h >> r;
1779                 t = d;
1780
1781                 data += 4;
1782                 len -= 4;
1783             }
1784
1785             pack = len < align ? len : align;
1786             d = 0;
1787             switch (pack) {
1788 #ifdef WORDS_BIGENDIAN
1789               case 3: d |= data[2] << 8;
1790               case 2: d |= data[1] << 16;
1791               case 1: d |= data[0] << 24;
1792               case 0:
1793                 h += (t << sr) | (d >> sl);
1794 #else
1795               case 3: d |= data[2] << 16;
1796               case 2: d |= data[1] << 8;
1797               case 1: d |= data[0];
1798               case 0:
1799                 h += (t >> sr) | (d << sl);
1800 #endif
1801                 h *= m;
1802                 h ^= h >> r;
1803             }
1804
1805             data += pack;
1806             len -= pack;
1807         }
1808         else
1809 #endif
1810         {
1811             do {
1812                 h += *(uint32_t *)data;
1813                 h *= m;
1814                 h ^= h >> r;
1815
1816                 data += 4;
1817                 len -= 4;
1818             } while (len >= 4);
1819         }
1820     }
1821
1822     switch(len) {
1823 #ifdef WORDS_BIGENDIAN
1824       case 3:
1825         h += data[2] << 8;
1826       case 2:
1827         h += data[1] << 16;
1828       case 1:
1829         h += data[0] << 24;
1830 #else
1831       case 3:
1832         h += data[2] << 16;
1833       case 2:
1834         h += data[1] << 8;
1835       case 1:
1836         h += data[0];
1837 #endif
1838         h *= m;
1839         h ^= h >> r;
1840     }
1841
1842     h *= m;
1843     h ^= h >> 10;
1844     h *= m;
1845     h ^= h >> 17;
1846
1847     return h;
1848 }
1849
/*
 * Hashes len bytes at ptr.  The seed is taken from rb_genrand_int32()
 * on the first call and reused by every later call.
 */
int
rb_memhash(const void *ptr, long len)
{
    static int seeded = 0;
    static unsigned int seed;

    if (seeded == 0) {
        seed = rb_genrand_int32();
        seeded = 1;
    }

    return hash(ptr, len, seed);
}
1863
1864 int
1865 rb_str_hash(VALUE str)
1866 {
1867     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
1868 }
1869
1870 int
1871 rb_str_hash_cmp(VALUE str1, VALUE str2)
1872 {
1873     int len;
1874
1875     if (!rb_str_comparable(str1, str2)) return 1;
1876     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1877         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1878         return 0;
1879     }
1880     return 1;
1881 }
1882
1883 /*
1884  * call-seq:
1885  *    str.hash   => fixnum
1886  *
1887  * Return a hash based on the string's length and content.
1888  */
1889
1890 static VALUE
1891 rb_str_hash_m(VALUE str)
1892 {
1893     int hval = rb_str_hash(str);
1894     return INT2FIX(hval);
1895 }
1896
1897 #define lesser(a,b) (((a)>(b))?(b):(a))
1898
1899 int
1900 rb_str_comparable(VALUE str1, VALUE str2)
1901 {
1902     int idx1, idx2;
1903     int rc1, rc2;
1904
1905     if (RSTRING_LEN(str1) == 0) return Qtrue;
1906     if (RSTRING_LEN(str2) == 0) return Qtrue;
1907     idx1 = ENCODING_GET(str1);
1908     idx2 = ENCODING_GET(str2);
1909     if (idx1 == idx2) return Qtrue;
1910     rc1 = rb_enc_str_coderange(str1);
1911     rc2 = rb_enc_str_coderange(str2);
1912     if (rc1 == ENC_CODERANGE_7BIT) {
1913         if (rc2 == ENC_CODERANGE_7BIT) return Qtrue;
1914         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
1915             return Qtrue;
1916     }
1917     if (rc2 == ENC_CODERANGE_7BIT) {
1918         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
1919             return Qtrue;
1920     }
1921     return Qfalse;
1922 }
1923
1924 int
1925 rb_str_cmp(VALUE str1, VALUE str2)
1926 {
1927     long len;
1928     int retval;
1929
1930     len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
1931     retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
1932     if (retval == 0) {
1933         if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
1934             if (!rb_enc_compatible(str1, str2)) {
1935                 if (ENCODING_GET(str1) - ENCODING_GET(str2) > 0)
1936                     return 1;
1937                 return -1;
1938             }
1939             return 0;
1940         }
1941         if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
1942         return -1;
1943     }
1944     if (retval > 0) return 1;
1945     return -1;
1946 }
1947
1948
1949 /*
1950  *  call-seq:
1951  *     str == obj   => true or false
1952  *
1953  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
1954  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
1955  *  <code><=></code> <i>obj</i> returns zero.
1956  */
1957
1958 VALUE
1959 rb_str_equal(VALUE str1, VALUE str2)
1960 {
1961     int len;
1962
1963     if (str1 == str2) return Qtrue;
1964     if (TYPE(str2) != T_STRING) {
1965         if (!rb_respond_to(str2, rb_intern("to_str"))) {
1966             return Qfalse;
1967         }
1968         return rb_equal(str2, str1);
1969     }
1970     if (!rb_str_comparable(str1, str2)) return Qfalse;
1971     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1972         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1973         return Qtrue;
1974     }
1975     return Qfalse;
1976 }
1977
1978 /*
1979  * call-seq:
1980  *   str.eql?(other)   => true or false
1981  *
1982  * Two strings are equal if the have the same length and content.
1983  */
1984
1985 static VALUE
1986 rb_str_eql(VALUE str1, VALUE str2)
1987 {
1988     if (TYPE(str2) != T_STRING || RSTRING_LEN(str1) != RSTRING_LEN(str2))
1989         return Qfalse;
1990
1991     if (!rb_str_comparable(str1, str2)) return Qfalse;
1992     if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2),
1993                lesser(RSTRING_LEN(str1), RSTRING_LEN(str2))) == 0)
1994         return Qtrue;
1995
1996     return Qfalse;
1997 }
1998
1999 /*
2000  *  call-seq:
2001  *     str <=> other_str   => -1, 0, +1
2002  *
2003  *  Comparison---Returns -1 if <i>other_str</i> is less than, 0 if
2004  *  <i>other_str</i> is equal to, and +1 if <i>other_str</i> is greater than
2005  *  <i>str</i>. If the strings are of different lengths, and the strings are
2006  *  equal when compared up to the shortest length, then the longer string is
2007  *  considered greater than the shorter one. In older versions of Ruby, setting
2008  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
2009  *  in favor of using <code>String#casecmp</code>.
2010  *
2011  *  <code><=></code> is the basis for the methods <code><</code>,
2012  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
2013  *  included from module <code>Comparable</code>.  The method
2014  *  <code>String#==</code> does not use <code>Comparable#==</code>.
2015  *
2016  *     "abcdef" <=> "abcde"     #=> 1
2017  *     "abcdef" <=> "abcdef"    #=> 0
2018  *     "abcdef" <=> "abcdefg"   #=> -1
2019  *     "abcdef" <=> "ABCDEF"    #=> 1
2020  */
2021
2022 static VALUE
2023 rb_str_cmp_m(VALUE str1, VALUE str2)
2024 {
2025     long result;
2026
2027     if (TYPE(str2) != T_STRING) {
2028         if (!rb_respond_to(str2, rb_intern("to_str"))) {
2029             return Qnil;
2030         }
2031         else if (!rb_respond_to(str2, rb_intern("<=>"))) {
2032             return Qnil;
2033         }
2034         else {
2035             VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
2036
2037             if (NIL_P(tmp)) return Qnil;
2038             if (!FIXNUM_P(tmp)) {
2039                 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
2040             }
2041             result = -FIX2LONG(tmp);
2042         }
2043     }
2044     else {
2045         result = rb_str_cmp(str1, str2);
2046     }
2047     return LONG2NUM(result);
2048 }
2049
2050 /*
2051  *  call-seq:
2052  *     str.casecmp(other_str)   => -1, 0, +1
2053  *
2054  *  Case-insensitive version of <code>String#<=></code>.
2055  *
2056  *     "abcdef".casecmp("abcde")     #=> 1
2057  *     "aBcDeF".casecmp("abcdef")    #=> 0
2058  *     "abcdef".casecmp("abcdefg")   #=> -1
2059  *     "abcdef".casecmp("ABCDEF")    #=> 0
2060  */
2061
2062 static VALUE
2063 rb_str_casecmp(VALUE str1, VALUE str2)
2064 {
2065     long len;
2066     rb_encoding *enc;
2067     char *p1, *p1end, *p2, *p2end;
2068
2069     StringValue(str2);
2070     enc = rb_enc_compatible(str1, str2);
2071     if (!enc) {
2072         return Qnil;
2073     }
2074
2075     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2076     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2077     while (p1 < p1end && p2 < p2end) {
2078         int c1 = rb_enc_codepoint(p1, p1end, enc);
2079         int c2 = rb_enc_codepoint(p2, p2end, enc);
2080
2081         if (c1 != c2) {
2082             c1 = rb_enc_toupper(c1, enc);
2083             c2 = rb_enc_toupper(c2, enc);
2084             if (c1 > c2) return INT2FIX(1);
2085             if (c1 < c2) return INT2FIX(-1);
2086         }
2087         len = rb_enc_codelen(c1, enc);
2088         p1 += len;
2089         p2 += len;
2090     }
2091     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2092     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2093     return INT2FIX(-1);
2094 }
2095
2096 static long
2097 rb_str_index(VALUE str, VALUE sub, long offset)
2098 {
2099     long pos;
2100     char *s, *sptr;
2101     long len, slen;
2102     rb_encoding *enc;
2103
2104     enc = rb_enc_check(str, sub);
2105     if (is_broken_string(sub)) {
2106         return -1;
2107     }
2108     len = str_strlen(str, enc);
2109     slen = str_strlen(sub, enc);
2110     if (offset < 0) {
2111         offset += len;
2112         if (offset < 0) return -1;
2113     }
2114     if (len - offset < slen) return -1;
2115     s = RSTRING_PTR(str);
2116     if (offset) {
2117         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2118         s += offset;
2119     }
2120     if (slen == 0) return offset;
2121     /* need proceed one character at a time */
2122     sptr = RSTRING_PTR(sub);
2123     slen = RSTRING_LEN(sub);
2124     len = RSTRING_LEN(str) - offset;
2125     for (;;) {
2126         char *t;
2127         pos = rb_memsearch(sptr, slen, s, len, enc);
2128         if (pos < 0) return pos;
2129         t = rb_enc_right_char_head(s, s+pos, enc);
2130         if (t == s + pos) break;
2131         if ((len -= t - s) <= 0) return -1;
2132         offset += t - s;
2133         s = t;
2134     }
2135     return pos + offset;
2136 }
2137
2138
2139 /*
2140  *  call-seq:
2141  *     str.index(substring [, offset])   => fixnum or nil
2142  *     str.index(regexp [, offset])      => fixnum or nil
2143  *
2144  *  Returns the index of the first occurrence of the given <i>substring</i> or
2145  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2146  *  found. If the second parameter is present, it specifies the position in the
2147  *  string to begin the search.
2148  *
2149  *     "hello".index('e')             #=> 1
2150  *     "hello".index('lo')            #=> 3
2151  *     "hello".index('a')             #=> nil
2152  *     "hello".index(?e)              #=> 1
2153  *     "hello".index(/[aeiou]/, -3)   #=> 4
2154  */
2155
2156 static VALUE
2157 rb_str_index_m(int argc, VALUE *argv, VALUE str)
2158 {
2159     VALUE sub;
2160     VALUE initpos;
2161     long pos;
2162
2163     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2164         pos = NUM2LONG(initpos);
2165     }
2166     else {
2167         pos = 0;
2168     }
2169     if (pos < 0) {
2170         pos += str_strlen(str, STR_ENC_GET(str));
2171         if (pos < 0) {
2172             if (TYPE(sub) == T_REGEXP) {
2173                 rb_backref_set(Qnil);
2174             }
2175             return Qnil;
2176         }
2177     }
2178
2179     switch (TYPE(sub)) {
2180       case T_REGEXP:
2181         pos = rb_reg_adjust_startpos(sub, str, pos, 0);
2182         pos = rb_reg_search(sub, str, pos, 0);
2183         pos = rb_str_sublen(str, pos);
2184         break;
2185
2186       default: {
2187         VALUE tmp;
2188
2189         tmp = rb_check_string_type(sub);
2190         if (NIL_P(tmp)) {
2191             rb_raise(rb_eTypeError, "type mismatch: %s given",
2192                      rb_obj_classname(sub));
2193         }
2194         sub = tmp;
2195       }
2196         /* fall through */
2197       case T_STRING:
2198         pos = rb_str_index(str, sub, pos);
2199         pos = rb_str_sublen(str, pos);
2200         break;
2201     }
2202
2203     if (pos == -1) return Qnil;
2204     return LONG2NUM(pos);
2205 }
2206
2207 static long
2208 rb_str_rindex(VALUE str, VALUE sub, long pos)
2209 {
2210     long len, slen;
2211     char *s, *sbeg, *e, *t;
2212     rb_encoding *enc;
2213     int singlebyte = single_byte_optimizable(str);
2214
2215     enc = rb_enc_check(str, sub);
2216     if (is_broken_string(sub)) {
2217         return -1;
2218     }
2219     len = str_strlen(str, enc);
2220     slen = str_strlen(sub, enc);
2221     /* substring longer than string */
2222     if (len < slen) return -1;
2223     if (len - pos < slen) {
2224         pos = len - slen;
2225     }
2226     if (len == 0) {
2227         return pos;
2228     }
2229     sbeg = RSTRING_PTR(str);
2230     e = RSTRING_END(str);
2231     t = RSTRING_PTR(sub);
2232     slen = RSTRING_LEN(sub);
2233     for (;;) {
2234         s = str_nth(sbeg, e, pos, enc, singlebyte);
2235         if (!s) return -1;
2236         if (memcmp(s, t, slen) == 0) {
2237             return pos;
2238         }
2239         if (pos == 0) break;
2240         pos--;
2241     }
2242     return -1;
2243 }
2244
2245
2246 /*
2247  *  call-seq:
2248  *     str.rindex(substring [, fixnum])   => fixnum or nil
2249  *     str.rindex(regexp [, fixnum])   => fixnum or nil
2250  *
2251  *  Returns the index of the last occurrence of the given <i>substring</i> or
2252  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2253  *  found. If the second parameter is present, it specifies the position in the
2254  *  string to end the search---characters beyond this point will not be
2255  *  considered.
2256  *
2257  *     "hello".rindex('e')             #=> 1
2258  *     "hello".rindex('l')             #=> 3
2259  *     "hello".rindex('a')             #=> nil
2260  *     "hello".rindex(?e)              #=> 1
2261  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
2262  */
2263
2264 static VALUE
2265 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
2266 {
2267     VALUE sub;
2268     VALUE vpos;
2269     rb_encoding *enc = STR_ENC_GET(str);
2270     long pos, len = str_strlen(str, enc);
2271
2272     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2273         pos = NUM2LONG(vpos);
2274         if (pos < 0) {
2275             pos += len;
2276             if (pos < 0) {
2277                 if (TYPE(sub) == T_REGEXP) {
2278                     rb_backref_set(Qnil);
2279                 }
2280                 return Qnil;
2281             }
2282         }
2283         if (pos > len) pos = len;
2284     }
2285     else {
2286         pos = len;
2287     }
2288
2289     switch (TYPE(sub)) {
2290       case T_REGEXP:
2291         /* enc = rb_get_check(str, sub); */
2292         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2293             pos = rb_reg_adjust_startpos(sub, str, pos, 1);
2294             pos = rb_reg_search(sub, str, pos, 1);
2295             pos = rb_str_sublen(str, pos);
2296         }
2297         if (pos >= 0) return LONG2NUM(pos);
2298         break;
2299
2300       default: {
2301         VALUE tmp;
2302
2303         tmp = rb_check_string_type(sub);
2304         if (NIL_P(tmp)) {
2305             rb_raise(rb_eTypeError, "type mismatch: %s given",
2306                      rb_obj_classname(sub));
2307         }
2308         sub = tmp;
2309       }
2310         /* fall through */
2311       case T_STRING:
2312         pos = rb_str_rindex(str, sub, pos);
2313         if (pos >= 0) return LONG2NUM(pos);
2314         break;
2315     }
2316     return Qnil;
2317 }
2318
2319 /*
2320  *  call-seq:
2321  *     str =~ obj   => fixnum or nil
2322  *
2323  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
2324  *  against <i>str</i>, and returns the position where the match starts, or
2325  *  <code>nil</code> if there is no match. Otherwise, invokes
2326  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2327  *  <code>=~</code> in <code>Object</code> returns <code>false</code>.
2328  *
2329  *     "cat o' 9 tails" =~ /\d/   #=> 7
2330  *     "cat o' 9 tails" =~ 9      #=> nil
2331  */
2332
2333 static VALUE
2334 rb_str_match(VALUE x, VALUE y)
2335 {
2336     switch (TYPE(y)) {
2337       case T_STRING:
2338         rb_raise(rb_eTypeError, "type mismatch: String given");
2339
2340       case T_REGEXP:
2341         return rb_reg_match(y, x);
2342
2343       default:
2344         return rb_funcall(y, rb_intern("=~"), 1, x);
2345     }
2346 }
2347
2348
2349 static VALUE get_pat(VALUE, int);
2350
2351
2352 /*
2353  *  call-seq:
2354  *     str.match(pattern)   => matchdata or nil
2355  *
2356  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2357  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
2358  *  parameter is present, it specifies the position in the string to begin the
2359  *  search.
2360  *
2361  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
2362  *     'hello'.match('(.)\1')[0]   #=> "ll"
2363  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
2364  *     'hello'.match('xx')         #=> nil
2365  *
2366  *  If a block is given, invokes the block with the MatchData if the match
2367  *  succeeds, so that you can write
2368  *
2369  *     str.match(pat) {|m| ...}
2370  *
2371  *  instead of
2372  *
2373  *     if m = str.match(pat)
2374  *       ...
2375  *     end
2376  *
2377  *  The return value is a value from block execution in this case.
2378  */
2379
2380 static VALUE
2381 rb_str_match_m(int argc, VALUE *argv, VALUE str)
2382 {
2383     VALUE re, result;
2384     if (argc < 1)
2385         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2386     re = argv[0];
2387     argv[0] = str;
2388     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2389     if (!NIL_P(result) && rb_block_given_p()) {
2390         return rb_yield(result);
2391     }
2392     return result;
2393 }
2394
/* Outcome of stepping a single character to its neighbour. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,      /* the character cannot be stepped */
    NEIGHBOR_FOUND    = 1,      /* a neighbouring character was written */
    NEIGHBOR_WRAPPED  = 2       /* stepping ran off the end of the range */
};
2400
2401 static enum neighbor_char
2402 enc_succ_char(char *p, int len, rb_encoding *enc)
2403 {
2404     int i, l;
2405     while (1) {
2406         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
2407             p[i] = '\0';
2408         if (i < 0)
2409             return NEIGHBOR_WRAPPED;
2410         ++((unsigned char*)p)[i];
2411         l = rb_enc_precise_mbclen(p, p+len, enc);
2412         if (MBCLEN_CHARFOUND_P(l)) {
2413             l = MBCLEN_CHARFOUND_LEN(l);
2414             if (l == len) {
2415                 return NEIGHBOR_FOUND;
2416             }
2417             else {
2418                 memset(p+l, 0xff, len-l);
2419             }
2420         }
2421         if (MBCLEN_INVALID_P(l) && i < len-1) {
2422             int len2, l2;
2423             for (len2 = len-1; 0 < len2; len2--) {
2424                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2425                 if (!MBCLEN_INVALID_P(l2))
2426                     break;
2427             }
2428             memset(p+len2+1, 0xff, len-(len2+1));
2429         }
2430     }
2431 }
2432
2433 static enum neighbor_char
2434 enc_pred_char(char *p, int len, rb_encoding *enc)
2435 {
2436     int i, l;
2437     while (1) {
2438         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
2439             p[i] = '\xff';
2440         if (i < 0)
2441             return NEIGHBOR_WRAPPED;
2442         --((unsigned char*)p)[i];
2443         l = rb_enc_precise_mbclen(p, p+len, enc);
2444         if (MBCLEN_CHARFOUND_P(l)) {
2445             l = MBCLEN_CHARFOUND_LEN(l);
2446             if (l == len) {
2447                 return NEIGHBOR_FOUND;
2448             }
2449             else {
2450                 memset(p+l, 0, len-l);
2451             }
2452         }
2453         if (MBCLEN_INVALID_P(l) && i < len-1) {
2454             int len2, l2;
2455             for (len2 = len-1; 0 < len2; len2--) {
2456                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2457                 if (!MBCLEN_INVALID_P(l2))
2458                     break;
2459             }
2460             memset(p+len2+1, 0, len-(len2+1));
2461         }
2462     }
2463 }
2464
2465 /*
2466   overwrite +p+ by succeeding letter in +enc+ and returns
2467   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2468   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2469   assuming each ranges are successive, and mbclen
2470   never change in each ranges.
2471   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2472   character.
2473  */
2474 static enum neighbor_char
2475 enc_succ_alnum_char(char *p, int len, rb_encoding *enc, char *carry)
2476 {
2477     enum neighbor_char ret;
2478     int c;
2479     int ctype;
2480     int range;
2481     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2482
2483     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2484     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2485         ctype = ONIGENC_CTYPE_DIGIT;
2486     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2487         ctype = ONIGENC_CTYPE_ALPHA;
2488     else
2489         return NEIGHBOR_NOT_CHAR;
2490
2491     MEMCPY(save, p, char, len);
2492     ret = enc_succ_char(p, len, enc);
2493     if (ret == NEIGHBOR_FOUND) {
2494         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2495         if (rb_enc_isctype(c, ctype, enc))
2496             return NEIGHBOR_FOUND;
2497     }
2498     MEMCPY(p, save, char, len);
2499     range = 1;
2500     while (1) {
2501         MEMCPY(save, p, char, len);
2502         ret = enc_pred_char(p, len, enc);
2503         if (ret == NEIGHBOR_FOUND) {
2504             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2505             if (!rb_enc_isctype(c, ctype, enc)) {
2506                 MEMCPY(p, save, char, len);
2507                 break;
2508             }
2509         }
2510         else {
2511             MEMCPY(p, save, char, len);
2512             break;
2513         }
2514         range++;
2515     }
2516     if (range == 1) {
2517         return NEIGHBOR_NOT_CHAR;
2518     }
2519
2520     if (ctype != ONIGENC_CTYPE_DIGIT) {
2521         MEMCPY(carry, p, char, len);
2522         return NEIGHBOR_WRAPPED;
2523     }
2524
2525     MEMCPY(carry, p, char, len);
2526     enc_succ_char(carry, len, enc);
2527     return NEIGHBOR_WRAPPED;
2528 }
2529
2530
2531 /*
2532  *  call-seq:
2533  *     str.succ   => new_str
2534  *     str.next   => new_str
2535  *
2536  *  Returns the successor to <i>str</i>. The successor is calculated by
2537  *  incrementing characters starting from the rightmost alphanumeric (or
2538  *  the rightmost character if there are no alphanumerics) in the
2539  *  string. Incrementing a digit always results in another digit, and
2540  *  incrementing a letter results in another letter of the same case.
2541  *  Incrementing nonalphanumerics uses the underlying character set's
2542  *  collating sequence.
2543  *
2544  *  If the increment generates a ``carry,'' the character to the left of
2545  *  it is incremented. This process repeats until there is no carry,
2546  *  adding an additional character if necessary.
2547  *
2548  *     "abcd".succ        #=> "abce"
2549  *     "THX1138".succ     #=> "THX1139"
2550  *     "<<koala>>".succ   #=> "<<koalb>>"
2551  *     "1999zzz".succ     #=> "2000aaa"
2552  *     "ZZZ9999".succ     #=> "AAAA0000"
2553  *     "***".succ         #=> "**+"
2554  */
2555
2556 VALUE
2557 rb_str_succ(VALUE orig)
2558 {
2559     rb_encoding *enc;
2560     VALUE str;
2561     char *sbeg, *s, *e, *last_alnum = 0;
2562     int c = -1;
2563     long l;
2564     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
2565     int carry_pos = 0, carry_len = 1;
2566     enum neighbor_char neighbor = NEIGHBOR_FOUND;
2567
2568     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
2569     rb_enc_cr_str_copy_for_substr(str, orig);
2570     OBJ_INFECT(str, orig);
2571     if (RSTRING_LEN(str) == 0) return str;
2572
2573     enc = STR_ENC_GET(orig);
2574     sbeg = RSTRING_PTR(str);
2575     s = e = sbeg + RSTRING_LEN(str);
2576
2577     while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
2578         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
2579             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
2580                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
2581                 s = last_alnum;
2582                 break;
2583             }
2584         }
2585         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2586         neighbor = enc_succ_alnum_char(s, l, enc, carry);
2587         switch (neighbor) {
2588           case NEIGHBOR_NOT_CHAR:
2589             continue;
2590           case NEIGHBOR_FOUND:
2591             return str;
2592           case NEIGHBOR_WRAPPED:
2593             last_alnum = s;
2594             break;
2595         }
2596         c = 1;
2597         carry_pos = s - sbeg;
2598         carry_len = l;
2599     }
2600     if (c == -1) {              /* str contains no alnum */
2601         s = e;
2602         while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
2603             enum neighbor_char neighbor;
2604             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2605             neighbor = enc_succ_char(s, l, enc);
2606             if (neighbor == NEIGHBOR_FOUND)
2607                 return str;
2608             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
2609                 /* wrapped to \0...\0.  search next valid char. */
2610                 enc_succ_char(s, l, enc);
2611             }
2612             if (!rb_enc_asciicompat(enc)) {
2613                 MEMCPY(carry, s, char, l);
2614                 carry_len = l;
2615             }
2616             carry_pos = s - sbeg;
2617         }
2618     }
2619     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
2620     s = RSTRING_PTR(str) + carry_pos;
2621     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
2622     memmove(s, carry, carry_len);
2623     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
2624     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2625     rb_enc_str_coderange(str);
2626     return str;
2627 }
2628
2629
2630 /*
2631  *  call-seq:
2632  *     str.succ!   => str
2633  *     str.next!   => str
2634  *
2635  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
2636  *  place.
2637  */
2638
2639 static VALUE
2640 rb_str_succ_bang(VALUE str)
2641 {
2642     rb_str_shared_replace(str, rb_str_succ(str));
2643
2644     return str;
2645 }
2646
2647
2648 /*
2649  *  call-seq:
2650  *     str.upto(other_str, exclusive=false) {|s| block }   => str
2651  *
2652  *  Iterates through successive values, starting at <i>str</i> and
2653  *  ending at <i>other_str</i> inclusive, passing each value in turn to
2654  *  the block. The <code>String#succ</code> method is used to generate
2655  *  each value.  If optional second argument exclusive is omitted or is <code>false</code>,
2656  *  the last value will be included; otherwise it will be excluded.
2657  *
2658  *     "a8".upto("b6") {|s| print s, ' ' }
2659  *     for s in "a8".."b6"
2660  *       print s, ' '
2661  *     end
2662  *
2663  *  <em>produces:</em>
2664  *
2665  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2666  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2667  */
2668
2669 static VALUE
2670 rb_str_upto(int argc, VALUE *argv, VALUE beg)
2671 {
2672     VALUE end, exclusive;
2673     VALUE current, after_end;
2674     ID succ;
2675     int n, excl;
2676     rb_encoding *enc;
2677
2678     rb_scan_args(argc, argv, "11", &end, &exclusive);
2679     excl = RTEST(exclusive);
2680     CONST_ID(succ, "succ");
2681     StringValue(end);
2682     enc = rb_enc_check(beg, end);
2683     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 &&
2684         is_ascii_string(beg) && is_ascii_string(end)) {
2685         char c = RSTRING_PTR(beg)[0];
2686         char e = RSTRING_PTR(end)[0];
2687
2688         if (c > e || (excl && c == e)) return beg;
2689         for (;;) {
2690             rb_yield(rb_enc_str_new(&c, 1, enc));
2691             if (!excl && c == e) break;
2692             c++;
2693             if (excl && c == e) break;
2694         }
2695         return beg;
2696     }
2697     n = rb_str_cmp(beg, end);
2698     if (n > 0 || (excl && n == 0)) return beg;
2699
2700     after_end = rb_funcall(end, succ, 0, 0);
2701     current = beg;
2702     while (!rb_str_equal(current, after_end)) {
2703         rb_yield(current);
2704         if (!excl && rb_str_equal(current, end)) break;
2705         current = rb_funcall(current, succ, 0, 0);
2706         StringValue(current);
2707         if (excl && rb_str_equal(current, end)) break;
2708         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
2709             break;
2710     }
2711
2712     return beg;
2713 }
2714
2715 static VALUE
2716 rb_str_subpat(VALUE str, VALUE re, int nth)
2717 {
2718     if (rb_reg_search(re, str, 0, 0) >= 0) {
2719         return rb_reg_nth_match(nth, rb_backref_get());
2720     }
2721     return Qnil;
2722 }
2723
2724 static VALUE
2725 rb_str_aref(VALUE str, VALUE indx)
2726 {
2727     long idx;
2728
2729     switch (TYPE(indx)) {
2730       case T_FIXNUM:
2731         idx = FIX2LONG(indx);
2732
2733       num_index:
2734         str = rb_str_substr(str, idx, 1);
2735         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
2736         return str;
2737
2738       case T_REGEXP:
2739         return rb_str_subpat(str, indx, 0);
2740
2741       case T_STRING:
2742         if (rb_str_index(str, indx, 0) != -1)
2743             return rb_str_dup(indx);
2744         return Qnil;
2745
2746       default:
2747         /* check if indx is Range */
2748         {
2749             long beg, len;
2750             VALUE tmp;
2751
2752             len = str_strlen(str, STR_ENC_GET(str));
2753             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
2754               case Qfalse:
2755                 break;
2756               case Qnil:
2757                 return Qnil;
2758               default:
2759                 tmp = rb_str_substr(str, beg, len);
2760                 return tmp;
2761             }
2762         }
2763         idx = NUM2LONG(indx);
2764         goto num_index;
2765     }
2766     return Qnil;                /* not reached */
2767 }
2768
2769
2770 /*
2771  *  call-seq:
2772  *     str[fixnum]                 => new_str or nil
2773  *     str[fixnum, fixnum]         => new_str or nil
2774  *     str[range]                  => new_str or nil
2775  *     str[regexp]                 => new_str or nil
2776  *     str[regexp, fixnum]         => new_str or nil
2777  *     str[other_str]              => new_str or nil
2778  *     str.slice(fixnum)           => new_str or nil
2779  *     str.slice(fixnum, fixnum)   => new_str or nil
2780  *     str.slice(range)            => new_str or nil
2781  *     str.slice(regexp)           => new_str or nil
2782  *     str.slice(regexp, fixnum)   => new_str or nil
2783  *     str.slice(other_str)        => new_str or nil
2784  *
2785  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
2786  *  substring of one character at that position. If passed two <code>Fixnum</code>
2787  *  objects, returns a substring starting at the offset given by the first, and
2788  *  a length given by the second. If given a range, a substring containing
2789  *  characters at offsets given by the range is returned. In all three cases, if
2790  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
2791  *  <code>nil</code> if the initial offset falls outside the string, the length
2792  *  is negative, or the beginning of the range is greater than the end.
2793  *
2794  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
2795  *  returned. If a numeric parameter follows the regular expression, that
2796  *  component of the <code>MatchData</code> is returned instead. If a
2797  *  <code>String</code> is given, that string is returned if it occurs in
2798  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
2799  *  match.
2800  *
2801  *     a = "hello there"
2802  *     a[1]                   #=> "e"
2803  *     a[1,3]                 #=> "ell"
2804  *     a[1..3]                #=> "ell"
2805  *     a[-3,2]                #=> "er"
2806  *     a[-4..-2]              #=> "her"
2807  *     a[12..-1]              #=> nil
2808  *     a[-2..-4]              #=> ""
2809  *     a[/[aeiou](.)\1/]      #=> "ell"
2810  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
2811  *     a[/[aeiou](.)\1/, 1]   #=> "l"
2812  *     a[/[aeiou](.)\1/, 2]   #=> nil
2813  *     a["lo"]                #=> "lo"
2814  *     a["bye"]               #=> nil
2815  */
2816
2817 static VALUE
2818 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
2819 {
2820     if (argc == 2) {
2821         if (TYPE(argv[0]) == T_REGEXP) {
2822             return rb_str_subpat(str, argv[0], NUM2INT(argv[1]));
2823         }
2824         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
2825     }
2826     if (argc != 1) {
2827         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2828     }
2829     return rb_str_aref(str, argv[0]);
2830 }
2831
2832 VALUE
2833 rb_str_drop_bytes(VALUE str, long len)
2834 {
2835     char *ptr = RSTRING_PTR(str);
2836     long olen = RSTRING_LEN(str), nlen;
2837
2838     str_modifiable(str);
2839     if (len > olen) len = olen;
2840     nlen = olen - len;
2841     if (nlen <= RSTRING_EMBED_LEN_MAX) {
2842         char *oldptr = ptr;
2843         int fl = (RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
2844         STR_SET_EMBED(str);
2845         STR_SET_EMBED_LEN(str, nlen);
2846         ptr = RSTRING(str)->as.ary;
2847         memcpy(ptr, oldptr + len, nlen);
2848         if (fl == STR_NOEMBED) xfree(oldptr);
2849     }
2850     else {
2851         if (!STR_SHARED_P(str)) rb_str_new4(str);
2852         ptr = RSTRING(str)->as.heap.ptr += len;
2853         RSTRING(str)->as.heap.len = nlen;
2854     }
2855     ptr[nlen] = 0;
2856     ENC_CODERANGE_CLEAR(str);
2857     return str;
2858 }
2859
2860 static void
2861 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
2862 {
2863     if (beg == 0 && RSTRING_LEN(val) == 0) {
2864         rb_str_drop_bytes(str, len);
2865         OBJ_INFECT(str, val);
2866         return;
2867     }
2868
2869     rb_str_modify(str);
2870     if (len < RSTRING_LEN(val)) {
2871         /* expand string */
2872         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
2873     }
2874
2875     if (RSTRING_LEN(val) != len) {
2876         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
2877                 RSTRING_PTR(str) + beg + len,
2878                 RSTRING_LEN(str) - (beg + len));
2879     }
2880     if (RSTRING_LEN(val) < beg && len < 0) {
2881         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
2882     }
2883     if (RSTRING_LEN(val) > 0) {
2884         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
2885     }
2886     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
2887     if (RSTRING_PTR(str)) {
2888         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2889     }
2890     OBJ_INFECT(str, val);
2891 }
2892
2893 static void
2894 rb_str_splice(VALUE str, long beg, long len, VALUE val)
2895 {
2896     long slen;
2897     char *p, *e;
2898     rb_encoding *enc;
2899     int singlebyte = single_byte_optimizable(str);
2900
2901     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
2902
2903     StringValue(val);
2904     rb_str_modify(str);
2905     enc = rb_enc_check(str, val);
2906     slen = str_strlen(str, enc);
2907
2908     if (slen < beg) {
2909       out_of_range:
2910         rb_raise(rb_eIndexError, "index %ld out of string", beg);
2911     }
2912     if (beg < 0) {
2913         if (-beg > slen) {
2914             goto out_of_range;
2915         }
2916         beg += slen;
2917     }
2918     if (slen < len || slen < beg + len) {
2919         len = slen - beg;
2920     }
2921     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
2922     if (!p) p = RSTRING_END(str);
2923     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
2924     if (!e) e = RSTRING_END(str);
2925     /* error check */
2926     beg = p - RSTRING_PTR(str); /* physical position */
2927     len = e - p;                /* physical length */
2928     rb_str_splice_0(str, beg, len, val);
2929     rb_enc_associate(str, enc);
2930 }
2931
2932 void
2933 rb_str_update(VALUE str, long beg, long len, VALUE val)
2934 {
2935     rb_str_splice(str, beg, len, val);
2936 }
2937
2938 static void
2939 rb_str_subpat_set(VALUE str, VALUE re, int nth, VALUE val)
2940 {
2941     VALUE match;
2942     long start, end, len;
2943     rb_encoding *enc;
2944     struct re_registers *regs;
2945
2946     if (rb_reg_search(re, str, 0, 0) < 0) {
2947         rb_raise(rb_eIndexError, "regexp not matched");
2948     }
2949     match = rb_backref_get();
2950     regs = RMATCH_REGS(match);
2951     if (nth >= regs->num_regs) {
2952       out_of_range:
2953         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
2954     }
2955     if (nth < 0) {
2956         if (-nth >= regs->num_regs) {
2957             goto out_of_range;
2958         }
2959         nth += regs->num_regs;
2960     }
2961
2962     start = BEG(nth);
2963     if (start == -1) {
2964         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
2965     }
2966     end = END(nth);
2967     len = end - start;
2968     StringValue(val);
2969     enc = rb_enc_check(str, val);
2970     rb_str_splice_0(str, start, len, val);
2971     rb_enc_associate(str, enc);
2972 }
2973
2974 static VALUE
2975 rb_str_aset(VALUE str, VALUE indx, VALUE val)
2976 {
2977     long idx, beg;
2978
2979     switch (TYPE(indx)) {
2980       case T_FIXNUM:
2981         idx = FIX2LONG(indx);
2982       num_index:
2983         rb_str_splice(str, idx, 1, val);
2984         return val;
2985
2986       case T_REGEXP:
2987         rb_str_subpat_set(str, indx, 0, val);
2988         return val;
2989
2990       case T_STRING:
2991         beg = rb_str_index(str, indx, 0);
2992         if (beg < 0) {
2993             rb_raise(rb_eIndexError, "string not matched");
2994         }
2995         beg = rb_str_sublen(str, beg);
2996         rb_str_splice(str, beg, str_strlen(indx, 0), val);
2997         return val;
2998
2999       default:
3000         /* check if indx is Range */
3001         {
3002             long beg, len;
3003             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
3004                 rb_str_splice(str, beg, len, val);
3005                 return val;
3006             }
3007         }
3008         idx = NUM2LONG(indx);
3009         goto num_index;
3010     }
3011 }
3012
3013 /*
3014  *  call-seq:
3015  *     str[fixnum] = new_str
3016  *     str[fixnum, fixnum] = new_str
3017  *     str[range] = aString
3018  *     str[regexp] = new_str
3019  *     str[regexp, fixnum] = new_str
3020  *     str[other_str] = new_str
3021  *
3022  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
3023  *  portion of the string affected is determined using the same criteria as
3024  *  <code>String#[]</code>. If the replacement string is not the same length as
3025  *  the text it is replacing, the string will be adjusted accordingly. If the
3026  *  regular expression or string is used as the index doesn't match a position
3027  *  in the string, <code>IndexError</code> is raised. If the regular expression
3028  *  form is used, the optional second <code>Fixnum</code> allows you to specify
3029  *  which portion of the match to replace (effectively using the
3030  *  <code>MatchData</code> indexing rules). The forms that take a
3031  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
3032  *  out of range; the <code>Range</code> form will raise a
3033  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
3034  *  forms will raise an <code>IndexError</code> when there is no match.
3035  */
3036
3037 static VALUE
3038 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
3039 {
3040     if (argc == 3) {
3041         if (TYPE(argv[0]) == T_REGEXP) {
3042             rb_str_subpat_set(str, argv[0], NUM2INT(argv[1]), argv[2]);
3043         }
3044         else {
3045             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
3046         }
3047         return argv[2];
3048     }
3049     if (argc != 2) {
3050         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3051     }
3052     return rb_str_aset(str, argv[0], argv[1]);
3053 }
3054
3055 /*
3056  *  call-seq:
3057  *     str.insert(index, other_str)   => str
3058  *
3059  *  Inserts <i>other_str</i> before the character at the given
3060  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
3061  *  end of the string, and insert <em>after</em> the given character.
3062  *  The intent is to insert <i>other_str</i> so that it starts at the given
3063  *  <i>index</i>.
3064  *
3065  *     "abcd".insert(0, 'X')    #=> "Xabcd"
3066  *     "abcd".insert(3, 'X')    #=> "abcXd"
3067  *     "abcd".insert(4, 'X')    #=> "abcdX"
3068  *     "abcd".insert(-3, 'X')   #=> "abXcd"
3069  *     "abcd".insert(-1, 'X')   #=> "abcdX"
3070  */
3071
3072 static VALUE
3073 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
3074 {
3075     long pos = NUM2LONG(idx);
3076
3077     if (pos == -1) {
3078         return rb_str_append(str, str2);
3079     }
3080     else if (pos < 0) {
3081         pos++;
3082     }
3083     rb_str_splice(str, pos, 0, str2);
3084     return str;
3085 }
3086
3087
3088 /*
3089  *  call-seq:
3090  *     str.slice!(fixnum)           => new_str or nil
3091  *     str.slice!(fixnum, fixnum)   => new_str or nil
3092  *     str.slice!(range)            => new_str or nil
3093  *     str.slice!(regexp)           => new_str or nil
3094  *     str.slice!(other_str)        => new_str or nil
3095  *
3096  *  Deletes the specified portion from <i>str</i>, and returns the portion
3097  *  deleted.
3098  *
3099  *     string = "this is a string"
3100  *     string.slice!(2)        #=> "i"
3101  *     string.slice!(3..6)     #=> " is "
3102  *     string.slice!(/s.*t/)   #=> "sa st"
3103  *     string.slice!("r")      #=> "r"
3104  *     string                  #=> "thing"
3105  */
3106
3107 static VALUE
3108 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
3109 {
3110     VALUE result;
3111     VALUE buf[3];
3112     int i;
3113
3114     if (argc < 1 || 2 < argc) {
3115         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
3116     }
3117     for (i=0; i<argc; i++) {
3118         buf[i] = argv[i];
3119     }
3120     rb_str_modify(str);
3121     buf[i] = rb_str_new(0,0);
3122     result = rb_str_aref_m(argc, buf, str);
3123     if (!NIL_P(result)) {
3124         rb_str_aset_m(argc+1, buf, str);
3125     }
3126     return result;
3127 }
3128
3129 static VALUE
3130 get_pat(VALUE pat, int quote)
3131 {
3132     VALUE val;
3133
3134     switch (TYPE(pat)) {
3135       case T_REGEXP:
3136         return pat;
3137
3138       case T_STRING:
3139         break;
3140
3141       default:
3142         val = rb_check_string_type(pat);
3143         if (NIL_P(val)) {
3144             Check_Type(pat, T_REGEXP);
3145         }
3146         pat = val;
3147     }
3148
3149     if (quote) {
3150         pat = rb_reg_quote(pat);
3151     }
3152
3153     return rb_reg_regcomp(pat);
3154 }
3155
3156
3157 /*
3158  *  call-seq:
3159  *     str.sub!(pattern, replacement)          => str or nil
3160  *     str.sub!(pattern) {|match| block }      => str or nil
3161  *
3162  *  Performs the substitutions of <code>String#sub</code> in place,
3163  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
3164  *  performed.
3165  */
3166
3167 static VALUE
3168 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
3169 {
3170     VALUE pat, repl, hash = Qnil;
3171     int iter = 0;
3172     int tainted = 0;
3173     long plen;
3174
3175     if (argc == 1 && rb_block_given_p()) {
3176         iter = 1;
3177     }
3178     else if (argc == 2) {
3179         repl = argv[1];
3180         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3181         if (NIL_P(hash)) {
3182             StringValue(repl);
3183         }
3184         if (OBJ_TAINTED(repl)) tainted = 1;
3185     }
3186     else {
3187         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3188     }
3189
3190     pat = get_pat(argv[0], 1);
3191     if (rb_reg_search(pat, str, 0, 0) >= 0) {
3192         rb_encoding *enc;
3193         int cr = ENC_CODERANGE(str);
3194         VALUE match = rb_backref_get();
3195         struct re_registers *regs = RMATCH_REGS(match);
3196         long beg0 = BEG(0);
3197         long end0 = END(0);
3198
3199         if (iter || !NIL_P(hash)) {
3200             char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
3201
3202             if (iter) {
3203                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3204             }
3205             else {
3206                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
3207                 repl = rb_obj_as_string(repl);
3208             }
3209             str_mod_check(str, p, len);
3210             str_frozen_check(str);
3211         }
3212         else {
3213             repl = rb_reg_regsub(repl, str, regs, pat);
3214         }
3215         enc = rb_enc_compatible(str, repl);
3216         if (!enc) {
3217             rb_encoding *str_enc = STR_ENC_GET(str);
3218             if (coderange_scan(RSTRING_PTR(str), beg0, str_enc) != ENC_CODERANGE_7BIT ||
3219                 coderange_scan(RSTRING_PTR(str)+end0,
3220                                RSTRING_LEN(str)-end0, str_enc) != ENC_CODERANGE_7BIT) {
3221                 rb_raise(rb_eArgError, "character encodings differ: %s and %s",
3222                          rb_enc_name(str_enc),
3223                          rb_enc_name(STR_ENC_GET(repl)));
3224             }
3225             enc = STR_ENC_GET(repl);
3226         }
3227         rb_str_modify(str);
3228         rb_enc_associate(str, enc);
3229         if (OBJ_TAINTED(repl)) tainted = 1;
3230         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
3231             int cr2 = ENC_CODERANGE(repl);
3232             if (cr2 == ENC_CODERANGE_UNKNOWN || cr2 > cr) cr = cr2;
3233         }
3234         plen = end0 - beg0;
3235         if (RSTRING_LEN(repl) > plen) {
3236             RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
3237         }
3238         if (RSTRING_LEN(repl) != plen) {
3239             memmove(RSTRING_PTR(str) + beg0 + RSTRING_LEN(repl),
3240                     RSTRING_PTR(str) + beg0 + plen,
3241                     RSTRING_LEN(str) - beg0 - plen);
3242         }
3243         memcpy(RSTRING_PTR(str) + beg0,
3244                RSTRING_PTR(repl), RSTRING_LEN(repl));
3245         STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
3246         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3247         ENC_CODERANGE_SET(str, cr);
3248         if (tainted) OBJ_TAINT(str);
3249
3250         return str;
3251     }
3252     return Qnil;
3253 }
3254
3255
3256 /*
3257  *  call-seq:
3258  *     str.sub(pattern, replacement)         => new_str
3259  *     str.sub(pattern) {|match| block }     => new_str
3260  *
3261  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
3262  *  <i>pattern</i> replaced with either <i>replacement</i> or the value of the
3263  *  block. The <i>pattern</i> will typically be a <code>Regexp</code>; if it is
3264  *  a <code>String</code> then no regular expression metacharacters will be
3265  *  interpreted (that is <code>/\d/</code> will match a digit, but
3266  *  <code>'\d'</code> will match a backslash followed by a 'd').
3267  *
3268  *  If the method call specifies <i>replacement</i>, special variables such as
3269  *  <code>$&</code> will not be useful, as substitution into the string occurs
3270  *  before the pattern match starts. However, the sequences <code>\1</code>,
3271  *  <code>\2</code>, <code>\k<group_name></code>, etc., may be used.
3272  *
3273  *  In the block form, the current match string is passed in as a parameter, and
3274  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3275  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3276  *  returned by the block will be substituted for the match on each call.
3277  *
3278  *  The result inherits any tainting in the original string or any supplied
3279  *  replacement string.
3280  *
3281  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
3282  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
3283  *     "hello".sub(/./) {|s| s[0].ord.to_s + ' ' }  #=> "104 ello"
3284  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
3285  */
3286
3287 static VALUE
3288 rb_str_sub(int argc, VALUE *argv, VALUE str)
3289 {
3290     str = rb_str_dup(str);
3291     rb_str_sub_bang(argc, argv, str);
3292     return str;
3293 }
3294
3295 static VALUE
3296 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
3297 {
3298     VALUE pat, val, repl, match, dest, hash = Qnil;
3299     struct re_registers *regs;
3300     long beg, n;
3301     long beg0, end0;
3302     long offset, blen, slen, len, last;
3303     int iter = 0;
3304     char *sp, *cp;
3305     int tainted = 0;
3306     rb_encoding *str_enc;
3307
3308     switch (argc) {
3309       case 1:
3310         RETURN_ENUMERATOR(str, argc, argv);
3311         iter = 1;
3312         break;
3313       case 2:
3314         repl = argv[1];
3315         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3316         if (NIL_P(hash)) {
3317             StringValue(repl);
3318         }
3319         if (OBJ_TAINTED(repl)) tainted = 1;
3320         break;
3321       default:
3322         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3323     }
3324
3325     pat = get_pat(argv[0], 1);
3326     beg = rb_reg_search(pat, str, 0, 0);
3327     if (beg < 0) {
3328         if (bang) return Qnil;  /* no match, no substitution */
3329         return rb_str_dup(str);
3330     }
3331
3332     offset = 0;
3333     n = 0;
3334     blen = RSTRING_LEN(str) + 30; /* len + margin */
3335     dest = rb_str_buf_new(blen);
3336     sp = RSTRING_PTR(str);
3337     slen = RSTRING_LEN(str);
3338     cp = sp;
3339     str_enc = STR_ENC_GET(str);
3340
3341     do {
3342         n++;
3343         match = rb_backref_get();
3344         regs = RMATCH_REGS(match);
3345         beg0 = BEG(0);
3346         end0 = END(0);
3347         if (iter || !NIL_P(hash)) {
3348             if (iter) {
3349                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3350             }
3351             else {
3352                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3353                 val = rb_obj_as_string(val);
3354             }
3355             str_mod_check(str, sp, slen);
3356             if (bang) str_frozen_check(str);
3357             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
3358                 rb_raise(rb_eRuntimeError, "block should not cheat");
3359             }
3360         }
3361         else {
3362             val = rb_reg_regsub(repl, str, regs, pat);
3363         }
3364
3365         if (OBJ_TAINTED(val)) tainted = 1;
3366
3367         len = beg - offset;     /* copy pre-match substr */
3368         if (len) {
3369             rb_enc_str_buf_cat(dest, cp, len, str_enc);
3370         }
3371
3372         rb_str_buf_append(dest, val);
3373
3374         last = offset;
3375         offset = end0;
3376         if (beg0 == end0) {
3377             /*
3378              * Always consume at least one character of the input string
3379              * in order to prevent infinite loops.
3380              */
3381             if (RSTRING_LEN(str) <= end0) break;
3382             len = rb_enc_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
3383             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
3384             offset = end0 + len;
3385         }
3386         cp = RSTRING_PTR(str) + offset;
3387         if (offset > RSTRING_LEN(str)) break;
3388         beg = rb_reg_search(pat, str, offset, 0);
3389     } while (beg >= 0);
3390     if (RSTRING_LEN(str) > offset) {
3391         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
3392     }
3393     rb_reg_search(pat, str, last, 0);
3394     if (bang) {
3395         rb_str_shared_replace(str, dest);
3396     }
3397     else {
3398         RBASIC(dest)->klass = rb_obj_class(str);
3399         OBJ_INFECT(dest, str);
3400         str = dest;
3401     }
3402
3403     if (tainted) OBJ_TAINT(str);
3404     return str;
3405 }
3406
3407
3408 /*
3409  *  call-seq:
3410  *     str.gsub!(pattern, replacement)        => str or nil
3411  *     str.gsub!(pattern) {|match| block }    => str or nil
3412  *
3413  *  Performs the substitutions of <code>String#gsub</code> in place, returning
3414  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
3415  */
3416
3417 static VALUE
3418 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
3419 {
3420     return str_gsub(argc, argv, str, 1);
3421 }
3422
3423
3424 /*
3425  *  call-seq:
3426  *     str.gsub(pattern, replacement)       => new_str
3427  *     str.gsub(pattern) {|match| block }   => new_str
3428  *
3429  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of <i>pattern</i>
3430  *  replaced with either <i>replacement</i> or the value of the block. The
3431  *  <i>pattern</i> will typically be a <code>Regexp</code>; if it is a
3432  *  <code>String</code> then no regular expression metacharacters will be
3433  *  interpreted (that is <code>/\d/</code> will match a digit, but
3434  *  <code>'\d'</code> will match a backslash followed by a 'd').
3435  *
3436  *  If a string is used as the replacement, special variables from the match
3437  *  (such as <code>$&</code> and <code>$1</code>) cannot be substituted into it,
3438  *  as substitution into the string occurs before the pattern match
3439  *  starts. However, the sequences <code>\1</code>, <code>\2</code>,
3440  *  <code>\k<group_name></code>, and so on may be used to interpolate
3441  *  successive groups in the match.
3442  *
3443  *  In the block form, the current match string is passed in as a parameter, and
3444  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3445  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3446  *  returned by the block will be substituted for the match on each call.
3447  *
3448  *  The result inherits any tainting in the original string or any supplied
3449  *  replacement string.
3450  *
3451  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
3452  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
3453  *     "hello".gsub(/./) {|s| s[0].ord.to_s + ' '}   #=> "104 101 108 108 111 "
3454  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
3455  */
3456
3457 static VALUE
3458 rb_str_gsub(int argc, VALUE *argv, VALUE str)
3459 {
3460     return str_gsub(argc, argv, str, 0);
3461 }
3462
3463
3464 /*
3465  *  call-seq:
3466  *     str.replace(other_str)   => str
3467  *
3468  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
3469  *  values in <i>other_str</i>.
3470  *
3471  *     s = "hello"         #=> "hello"
3472  *     s.replace "world"   #=> "world"
3473  */
3474
3475 static VALUE
3476 rb_str_replace(VALUE str, VALUE str2)
3477 {
3478     long len;
3479     if (str == str2) return str;
3480
3481     StringValue(str2);
3482     len = RSTRING_LEN(str2);
3483     if (STR_ASSOC_P(str2)) {
3484         str2 = rb_str_new4(str2);
3485     }
3486     if (str_independent(str) && !STR_EMBED_P(str)) {
3487         xfree(RSTRING_PTR(str));
3488     }
3489     if (STR_SHARED_P(str2)) {
3490         STR_SET_NOEMBED(str);
3491         RSTRING(str)->as.heap.len = len;
3492         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
3493         FL_SET(str, ELTS_SHARED);
3494         FL_UNSET(str, STR_ASSOC);
3495         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
3496     }
3497     else {
3498         str_replace_shared(str, rb_str_new4(str2));
3499     }
3500
3501     OBJ_INFECT(str, str2);
3502     rb_enc_cr_str_exact_copy(str, str2);
3503     return str;
3504 }
3505
3506 /*
3507  *  call-seq:
3508  *     string.clear    ->  string
3509  *
3510  *  Makes string empty.
3511  *
3512  *     a = "abcde"
3513  *     a.clear    #=> ""
3514  */
3515
3516 static VALUE
3517 rb_str_clear(VALUE str)
3518 {
3519     /* rb_str_modify() */       /* no need for str_make_independent */
3520     if (str_independent(str) && !STR_EMBED_P(str)) {
3521         xfree(RSTRING_PTR(str));
3522     }
3523     STR_SET_EMBED(str);
3524     STR_SET_EMBED_LEN(str, 0);
3525     RSTRING_PTR(str)[0] = 0;
3526     ENC_CODERANGE_CLEAR(str);
3527     return str;
3528 }
3529
3530 /*
3531  *  call-seq:
3532  *     string.chr    ->  string
3533  *
3534  *  Returns a one-character string at the beginning of the string.
3535  *
3536  *     a = "abcde"
3537  *     a.chr    #=> "a"
3538  */
3539
3540 static VALUE
3541 rb_str_chr(VALUE str)
3542 {
3543     return rb_str_substr(str, 0, 1);
3544 }
3545
3546 /*
3547  *  call-seq:
3548  *     str.getbyte(index)          => 0 .. 255
3549  *
3550  *  returns the <i>index</i>th byte as an integer.
3551  */
3552 static VALUE
3553 rb_str_getbyte(VALUE str, VALUE index)
3554 {
3555     long pos = NUM2LONG(index);
3556
3557     if (pos < 0)
3558         pos += RSTRING_LEN(str);
3559     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
3560         return Qnil;
3561
3562     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
3563 }
3564
3565 /*
3566  *  call-seq:
3567  *     str.setbyte(index, int) => int
3568  *
3569  *  modifies the <i>index</i>th byte as <i>int</i>.
3570  */
3571 static VALUE
3572 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
3573 {
3574     long pos = NUM2LONG(index);
3575     int byte = NUM2INT(value);
3576
3577     rb_str_modify(str);
3578
3579     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
3580         rb_raise(rb_eIndexError, "index %ld out of string", pos);
3581     if (pos < 0)
3582         pos += RSTRING_LEN(str);
3583
3584     RSTRING_PTR(str)[pos] = byte;
3585
3586     return value;
3587 }
3588
3589 /*
3590  *  call-seq:
3591  *     str.reverse   => new_str
3592  *
3593  *  Returns a new string with the characters from <i>str</i> in reverse order.
3594  *
3595  *     "stressed".reverse   #=> "desserts"
3596  */
3597
3598 static VALUE
3599 rb_str_reverse(VALUE str)
3600 {
3601     rb_encoding *enc;
3602     VALUE rev;
3603     char *s, *e, *p;
3604     int single = 1;
3605
3606     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
3607     enc = STR_ENC_GET(str);
3608     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
3609     s = RSTRING_PTR(str); e = RSTRING_END(str);
3610     p = RSTRING_END(rev);
3611
3612     if (RSTRING_LEN(str) > 1) {
3613         if (single_byte_optimizable(str)) {
3614             while (s < e) {
3615                 *--p = *s++;
3616             }
3617         }
3618         else {
3619             while (s < e) {
3620                 int clen = rb_enc_mbclen(s, e, enc);
3621
3622                 if (clen > 1 || (*s & 0x80)) single = 0;
3623                 p -= clen;
3624                 memcpy(p, s, clen);
3625                 s += clen;
3626             }
3627         }
3628     }
3629     STR_SET_LEN(rev, RSTRING_LEN(str));
3630     OBJ_INFECT(rev, str);
3631     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
3632         if (single) {
3633             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
3634         }
3635         else {
3636             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
3637         }
3638     }
3639     rb_enc_cr_str_copy_for_substr(rev, str);
3640
3641     return rev;
3642 }
3643
3644
3645 /*
3646  *  call-seq:
3647  *     str.reverse!   => str
3648  *
3649  *  Reverses <i>str</i> in place.
3650  */
3651
3652 static VALUE
3653 rb_str_reverse_bang(VALUE str)
3654 {
3655     if (RSTRING_LEN(str) > 1) {
3656         if (single_byte_optimizable(str)) {
3657             char *s, *e, c;
3658             int cr = ENC_CODERANGE(str);
3659             int single = 1;
3660
3661             rb_str_modify(str);
3662             s = RSTRING_PTR(str);
3663             e = RSTRING_END(str) - 1;
3664             while (s < e) {
3665                 c = *s;
3666                 if (*s & 0x80) single = 0;
3667                 *s++ = *e;
3668                 *e-- = c;
3669             }
3670             if (cr == ENC_CODERANGE_UNKNOWN && single) {
3671                 cr = ENC_CODERANGE_7BIT;
3672             }
3673             ENC_CODERANGE_SET(str, cr);
3674         }
3675         else {
3676             rb_str_shared_replace(str, rb_str_reverse(str));
3677         }
3678     }
3679     return str;
3680 }
3681
3682
3683 /*
3684  *  call-seq:
3685  *     str.include? other_str   => true or false
3686  *
3687  *  Returns <code>true</code> if <i>str</i> contains the given string or
3688  *  character.
3689  *
3690  *     "hello".include? "lo"   #=> true
3691  *     "hello".include? "ol"   #=> false
3692  *     "hello".include? ?h     #=> true
3693  */
3694
3695 static VALUE
3696 rb_str_include(VALUE str, VALUE arg)
3697 {
3698     long i;
3699
3700     StringValue(arg);
3701     i = rb_str_index(str, arg, 0);
3702
3703     if (i == -1) return Qfalse;
3704     return Qtrue;
3705 }
3706
3707
3708 /*
3709  *  call-seq:
3710  *     str.to_i(base=10)   => integer
3711  *
3712  *  Returns the result of interpreting leading characters in <i>str</i> as an
3713  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
3714  *  end of a valid number are ignored. If there is not a valid number at the
 *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
 *  exception when <i>base</i> is valid; a negative <i>base</i> raises
 *  <code>ArgumentError</code>.
3717  *
3718  *     "12345".to_i             #=> 12345
3719  *     "99 red balloons".to_i   #=> 99
3720  *     "0a".to_i                #=> 0
3721  *     "0a".to_i(16)            #=> 10
3722  *     "hello".to_i             #=> 0
3723  *     "1100101".to_i(2)        #=> 101
3724  *     "1100101".to_i(8)        #=> 294977
3725  *     "1100101".to_i(10)       #=> 1100101
3726  *     "1100101".to_i(16)       #=> 17826049
3727  */
3728
3729 static VALUE
3730 rb_str_to_i(int argc, VALUE *argv, VALUE str)
3731 {
3732     int base;
3733
3734     if (argc == 0) base = 10;
3735     else {
3736         VALUE b;
3737
3738         rb_scan_args(argc, argv, "01", &b);
3739         base = NUM2INT(b);
3740     }
3741     if (base < 0) {
3742         rb_raise(rb_eArgError, "invalid radix %d", base);
3743     }
3744     return rb_str_to_inum(str, base, Qfalse);
3745 }
3746
3747
3748 /*
3749  *  call-seq:
3750  *     str.to_f   => float
3751  *
3752  *  Returns the result of interpreting leading characters in <i>str</i> as a
3753  *  floating point number. Extraneous characters past the end of a valid number
3754  *  are ignored. If there is not a valid number at the start of <i>str</i>,
3755  *  <code>0.0</code> is returned. This method never raises an exception.
3756  *
3757  *     "123.45e1".to_f        #=> 1234.5
3758  *     "45.67 degrees".to_f   #=> 45.67
3759  *     "thx1138".to_f         #=> 0.0
3760  */
3761
3762 static VALUE
3763 rb_str_to_f(VALUE str)
3764 {
3765     return DOUBLE2NUM(rb_str_to_dbl(str, Qfalse));
3766 }
3767
3768
3769 /*
3770  *  call-seq:
3771  *     str.to_s     => str
3772  *     str.to_str   => str
3773  *
3774  *  Returns the receiver.
3775  */
3776
3777 static VALUE
3778 rb_str_to_s(VALUE str)
3779 {
3780     if (rb_obj_class(str) != rb_cString) {
3781         VALUE dup = str_alloc(rb_cString);
3782         rb_str_replace(dup, str);
3783         return dup;
3784     }
3785     return str;
3786 }
3787
3788 static void
3789 str_cat_char(VALUE str, int c, rb_encoding *enc)
3790 {
3791     char s[16];
3792     int n = rb_enc_codelen(c, enc);
3793
3794     rb_enc_mbcput(c, s, enc);
3795     rb_enc_str_buf_cat(str, s, n, enc);
3796 }
3797
3798 static void
3799 prefix_escape(VALUE str, int c, rb_encoding *enc)
3800 {
3801     str_cat_char(str, '\\', enc);
3802     str_cat_char(str, c, enc);
3803 }
3804
3805 /*
3806  * call-seq:
3807  *   str.inspect   => string
3808  *
3809  * Returns a printable version of _str_, surrounded by quote marks,
3810  * with special characters escaped.
3811  *
3812  *    str = "hello"
3813  *    str[3] = "\b"
3814  *    str.inspect       #=> "\"hel\\bo\""
3815  */
3816
3817 VALUE
3818 rb_str_inspect(VALUE str)
3819 {
3820     rb_encoding *enc = STR_ENC_GET(str);
3821     char *p, *pend;
3822     VALUE result = rb_str_buf_new(0);
3823
3824     if (!rb_enc_asciicompat(enc)) enc = rb_usascii_encoding();
3825     rb_enc_associate(result, enc);
3826     str_cat_char(result, '"', enc);
3827     p = RSTRING_PTR(str); pend = RSTRING_END(str);
3828     while (p < pend) {
3829         int c;
3830         int n;
3831         int cc;
3832
3833         n = rb_enc_precise_mbclen(p, pend, enc);
3834         if (!MBCLEN_CHARFOUND_P(n)) {
3835             p++;
3836             n = 1;
3837             goto escape_codepoint;
3838         }
3839         n = MBCLEN_CHARFOUND_LEN(n);
3840
3841         c = rb_enc_codepoint(p, pend, enc);
3842         n = rb_enc_codelen(c, enc);
3843
3844         p += n;
3845         if (c == '"'|| c == '\\' ||
3846             (c == '#' &&
3847              p < pend &&
3848              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
3849              (cc = rb_enc_codepoint(p,pend,enc),
3850               (cc == '$' || cc == '@' || cc == '{')))) {
3851             prefix_escape(result, c, enc);
3852         }
3853         else if (c == '\n') {
3854             prefix_escape(result, 'n', enc);
3855         }
3856         else if (c == '\r') {
3857             prefix_escape(result, 'r', enc);
3858         }
3859         else if (c == '\t') {
3860             prefix_escape(result, 't', enc);
3861         }
3862         else if (c == '\f') {
3863             prefix_escape(result, 'f', enc);
3864         }
3865         else if (c == '\013') {
3866             prefix_escape(result, 'v', enc);
3867         }
3868         else if (c == '\010') {
3869             prefix_escape(result, 'b', enc);
3870         }
3871         else if (c == '\007') {
3872             prefix_escape(result, 'a', enc);
3873         }
3874         else if (c == 033) {
3875             prefix_escape(result, 'e', enc);
3876         }
3877         else if (rb_enc_isprint(c, enc)) {
3878             rb_enc_str_buf_cat(result, p-n, n, enc);
3879         }
3880         else {
3881             char buf[5];
3882             char *s;
3883             char *q;
3884
3885           escape_codepoint:
3886             for (q = p-n; q < p; q++) {
3887                 s = buf;
3888                 sprintf(buf, "\\x%02X", *q & 0377);
3889                 while (*s) {
3890                     str_cat_char(result, *s++, enc);
3891                 }
3892             }
3893         }
3894     }
3895     str_cat_char(result, '"', enc);
3896
3897     OBJ_INFECT(result, str);
3898     return result;
3899 }
3900
/* Non-zero when p lies before e and points at '$', '@' or '{'.
 * rb_str_dump() puts a backslash before a '#' only when this holds for
 * the byte that follows it. */
#define IS_EVSTR(p,e) \
    ((e) > (p) && (*(p) == '{' || *(p) == '@' || *(p) == '$'))
3902
3903 /*
3904  *  call-seq:
3905  *     str.dump   => new_str
3906  *
 *  Produces a version of <i>str</i> with all nonprinting characters replaced by
 *  <code>\xNN</code> notation and all special characters escaped.
3909  */
3910
3911 VALUE
3912 rb_str_dump(VALUE str)
3913 {
3914     rb_encoding *enc0 = rb_enc_get(str);
3915     long len;
3916     const char *p, *pend;
3917     char *q, *qend;
3918     VALUE result;
3919
3920     len = 2;                    /* "" */
3921     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3922     while (p < pend) {
3923         unsigned char c = *p++;
3924         switch (c) {
3925           case '"':  case '\\':
3926           case '\n': case '\r':
3927           case '\t': case '\f':
3928           case '\013': case '\010': case '\007': case '\033':
3929             len += 2;
3930             break;
3931
3932           case '#':
3933             len += IS_EVSTR(p, pend) ? 2 : 1;
3934             break;
3935
3936           default:
3937             if (ISPRINT(c)) {
3938                 len++;
3939             }
3940             else {
3941                 len += 4;               /* \xNN */
3942             }
3943             break;
3944         }
3945     }
3946     if (!rb_enc_asciicompat(enc0)) {
3947         len += 19;              /* ".force_encoding('')" */
3948         len += strlen(enc0->name);
3949     }
3950
3951     result = rb_str_new5(str, 0, len);
3952     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3953     q = RSTRING_PTR(result); qend = q + len;
3954
3955     *q++ = '"';
3956     while (p < pend) {
3957         unsigned char c = *p++;
3958
3959         if (c == '"' || c == '\\') {
3960             *q++ = '\\';
3961             *q++ = c;
3962         }
3963         else if (c == '#') {
3964             if (IS_EVSTR(p, pend)) *q++ = '\\';
3965             *q++ = '#';
3966         }
3967         else if (c == '\n') {
3968             *q++ = '\\';
3969             *q++ = 'n';
3970         }
3971         else if (c == '\r') {
3972             *q++ = '\\';
3973             *q++ = 'r';
3974         }
3975         else if (c == '\t') {
3976             *q++ = '\\';
3977             *q++ = 't';
3978         }
3979         else if (c == '\f') {
3980             *q++ = '\\';
3981             *q++ = 'f';
3982         }
3983         else if (c == '\013') {
3984             *q++ = '\\';
3985             *q++ = 'v';
3986         }
3987         else if (c == '\010') {
3988             *q++ = '\\';
3989             *q++ = 'b';
3990         }
3991         else if (c == '\007') {
3992             *q++ = '\\';
3993             *q++ = 'a';
3994         }
3995         else if (c == '\033') {
3996             *q++ = '\\';
3997             *q++ = 'e';
3998         }
3999         else if (ISPRINT(c)) {
4000             *q++ = c;
4001         }
4002         else {
4003             *q++ = '\\';
4004             sprintf(q, "x%02X", c);
4005             q += 3;
4006         }
4007     }
4008     *q++ = '"';
4009     if (!rb_enc_asciicompat(enc0)) {
4010         sprintf(q, ".force_encoding(\"%s\")", enc0->name);
4011         enc0 = rb_ascii8bit_encoding();
4012     }
4013
4014     OBJ_INFECT(result, str);
4015     /* result from dump is ASCII */
4016     rb_enc_associate(result, enc0);
4017     return result;
4018 }
4019
4020
4021 /*
4022  *  call-seq:
4023  *     str.upcase!   => str or nil
4024  *
4025  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
4026  *  were made.
4027  *  Note: case replacement is effective only in ASCII region.
4028  */
4029
4030 static VALUE
4031 rb_str_upcase_bang(VALUE str)
4032 {
4033     rb_encoding *enc;
4034     char *s, *send;
4035     int modify = 0;
4036     int cr = ENC_CODERANGE(str);
4037
4038     rb_str_modify(str);
4039     enc = STR_ENC_GET(str);
4040     s = RSTRING_PTR(str); send = RSTRING_END(str);
4041     while (s < send) {
4042         int c = rb_enc_codepoint(s, send, enc);
4043
4044         if (rb_enc_islower(c, enc)) {
4045             /* assuming toupper returns codepoint with same size */
4046             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4047             modify = 1;
4048         }
4049         s += rb_enc_codelen(c, enc);
4050     }
4051
4052     ENC_CODERANGE_SET(str, cr);
4053     if (modify) return str;
4054     return Qnil;
4055 }
4056
4057
4058 /*
4059  *  call-seq:
4060  *     str.upcase   => new_str
4061  *
4062  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
4063  *  uppercase counterparts. The operation is locale insensitive---only
4064  *  characters ``a'' to ``z'' are affected.
4065  *  Note: case replacement is effective only in ASCII region.
4066  *
4067  *     "hEllO".upcase   #=> "HELLO"
4068  */
4069
4070 static VALUE
4071 rb_str_upcase(VALUE str)
4072 {
4073     str = rb_str_dup(str);
4074     rb_str_upcase_bang(str);
4075     return str;
4076 }
4077
4078
4079 /*
4080  *  call-seq:
4081  *     str.downcase!   => str or nil
4082  *
4083  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4084  *  changes were made.
4085  *  Note: case replacement is effective only in ASCII region.
4086  */
4087
4088 static VALUE
4089 rb_str_downcase_bang(VALUE str)
4090 {
4091     rb_encoding *enc;
4092     char *s, *send;
4093     int modify = 0;
4094     int cr = ENC_CODERANGE(str);
4095
4096     rb_str_modify(str);
4097     enc = STR_ENC_GET(str);
4098     s = RSTRING_PTR(str); send = RSTRING_END(str);
4099     while (s < send) {
4100         int c = rb_enc_codepoint(s, send, enc);
4101
4102         if (rb_enc_isupper(c, enc)) {
4103             /* assuming toupper returns codepoint with same size */
4104             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4105             modify = 1;
4106         }
4107         s += rb_enc_codelen(c, enc);
4108     }
4109
4110     ENC_CODERANGE_SET(str, cr);
4111     if (modify) return str;
4112     return Qnil;
4113 }
4114
4115
4116 /*
4117  *  call-seq:
4118  *     str.downcase   => new_str
4119  *
4120  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
4121  *  lowercase counterparts. The operation is locale insensitive---only
4122  *  characters ``A'' to ``Z'' are affected.
4123  *  Note: case replacement is effective only in ASCII region.
4124  *
4125  *     "hEllO".downcase   #=> "hello"
4126  */
4127
4128 static VALUE
4129 rb_str_downcase(VALUE str)
4130 {
4131     str = rb_str_dup(str);
4132     rb_str_downcase_bang(str);
4133     return str;
4134 }
4135
4136
4137 /*
4138  *  call-seq:
4139  *     str.capitalize!   => str or nil
4140  *
4141  *  Modifies <i>str</i> by converting the first character to uppercase and the
4142  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
4143  *  Note: case conversion is effective only in ASCII region.
4144  *
4145  *     a = "hello"
4146  *     a.capitalize!   #=> "Hello"
4147  *     a               #=> "Hello"
4148  *     a.capitalize!   #=> nil
4149  */
4150
4151 static VALUE
4152 rb_str_capitalize_bang(VALUE str)
4153 {
4154     rb_encoding *enc;
4155     char *s, *send;
4156     int modify = 0;
4157     int c;
4158     int cr = ENC_CODERANGE(str);
4159
4160     rb_str_modify(str);
4161     enc = STR_ENC_GET(str);
4162     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4163     s = RSTRING_PTR(str); send = RSTRING_END(str);
4164
4165     c = rb_enc_codepoint(s, send, enc);
4166     if (rb_enc_islower(c, enc)) {
4167         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4168         modify = 1;
4169     }
4170     s += rb_enc_codelen(c, enc);
4171     while (s < send) {
4172         c = rb_enc_codepoint(s, send, enc);
4173         if (rb_enc_isupper(c, enc)) {
4174             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4175             modify = 1;
4176         }
4177         s += rb_enc_codelen(c, enc);
4178     }
4179
4180     ENC_CODERANGE_SET(str, cr);
4181     if (modify) return str;
4182     return Qnil;
4183 }
4184
4185
4186 /*
4187  *  call-seq:
4188  *     str.capitalize   => new_str
4189  *
4190  *  Returns a copy of <i>str</i> with the first character converted to uppercase
4191  *  and the remainder to lowercase.
4192  *  Note: case conversion is effective only in ASCII region.
4193  *
4194  *     "hello".capitalize    #=> "Hello"
4195  *     "HELLO".capitalize    #=> "Hello"
4196  *     "123ABC".capitalize   #=> "123abc"
4197  */
4198
4199 static VALUE
4200 rb_str_capitalize(VALUE str)
4201 {
4202     str = rb_str_dup(str);
4203     rb_str_capitalize_bang(str);
4204     return str;
4205 }
4206
4207
4208 /*
4209  *  call-seq:
 *     str.swapcase!   => str or nil
4211  *
4212  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
4213  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
4214  *  Note: case conversion is effective only in ASCII region.
4215  */
4216
4217 static VALUE
4218 rb_str_swapcase_bang(VALUE str)
4219 {
4220     rb_encoding *enc;
4221     char *s, *send;
4222     int modify = 0;
4223     int cr = ENC_CODERANGE(str);
4224
4225     rb_str_modify(str);
4226     enc = STR_ENC_GET(str);
4227     s = RSTRING_PTR(str); send = RSTRING_END(str);
4228     while (s < send) {
4229         int c = rb_enc_codepoint(s, send, enc);
4230
4231         if (rb_enc_isupper(c, enc)) {
4232             /* assuming toupper returns codepoint with same size */
4233             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4234             modify = 1;
4235         }
4236         else if (rb_enc_islower(c, enc)) {
4237             /* assuming toupper returns codepoint with same size */
4238             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4239             modify = 1;
4240         }
4241         s += rb_enc_codelen(c, enc);
4242     }
4243
4244     ENC_CODERANGE_SET(str, cr);
4245     if (modify) return str;
4246     return Qnil;
4247 }
4248
4249
4250 /*
4251  *  call-seq:
4252  *     str.swapcase   => new_str
4253  *
4254  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
4255  *  to lowercase and lowercase characters converted to uppercase.
4256  *  Note: case conversion is effective only in ASCII region.
4257  *
4258  *     "Hello".swapcase          #=> "hELLO"
4259  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
4260  */
4261
4262 static VALUE
4263 rb_str_swapcase(VALUE str)
4264 {
4265     str = rb_str_dup(str);
4266     rb_str_swapcase_bang(str);
4267     return str;
4268 }
4269
/* Pointer to unsigned bytes. */
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification (e.g. "a-z"),
 * advanced one code point at a time by trnext().
 */
struct tr {
    /* gen: non-zero while a c1-c2 range is being expanded;
     * now: code point most recently produced;
     * max: last code point of the range being expanded */
    int gen, now, max;
    /* p: next unread byte of the specification; pend: one past its end */
    char *p, *pend;
};
4276
4277 static int
4278 trnext(struct tr *t, rb_encoding *enc)
4279 {
4280     for (;;) {
4281         if (!t->gen) {
4282             if (t->p == t->pend) return -1;
4283             if (t->p < t->pend - 1 && *t->p == '\\') {
4284                 t->p++;
4285             }
4286             t->now = rb_enc_codepoint(t->p, t->pend, enc);
4287             t->p += rb_enc_codelen(t->now, enc);
4288             if (t->p < t->pend - 1 && *t->p == '-') {
4289                 t->p++;
4290                 if (t->p < t->pend) {
4291                     int c = rb_enc_codepoint(t->p, t->pend, enc);
4292                     t->p += rb_enc_codelen(c, enc);
4293                     if (t->now > c) continue;
4294                     t->gen = 1;
4295                     t->max = c;
4296                 }
4297             }
4298             return t->now;
4299         }
4300         else if (++t->now < t->max) {
4301             return t->now;
4302         }
4303         else {
4304             t->gen = 0;
4305             return t->max;
4306         }
4307     }
4308 }
4309
4310 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
4311
4312 static VALUE
4313 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
4314 {
4315     int trans[256];
4316     rb_encoding *enc, *e1, *e2;
4317     struct tr trsrc, trrepl;
4318     int cflag = 0;
4319     int c, c0, last = 0, modify = 0, i, l;
4320     char *s, *send;
4321     VALUE hash = 0;
4322     int singlebyte = single_byte_optimizable(str);
4323
4324     StringValue(src);
4325     StringValue(repl);
4326     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4327     if (RSTRING_LEN(repl) == 0) {
4328         return rb_str_delete_bang(1, &src, str);
4329     }
4330
4331     e1 = rb_enc_check(str, src);
4332     e2 = rb_enc_check(str, repl);
4333     if (e1 == e2) {
4334         enc = e1;
4335     }
4336     else {
4337         enc = rb_enc_check(src, repl);
4338     }
4339     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
4340     if (RSTRING_LEN(src) > 1 &&
4341         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
4342         trsrc.p + l < trsrc.pend) {
4343         cflag = 1;
4344         trsrc.p += l;
4345     }
4346     trrepl.p = RSTRING_PTR(repl);
4347     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
4348     trsrc.gen = trrepl.gen = 0;
4349     trsrc.now = trrepl.now = 0;
4350     trsrc.max = trrepl.max = 0;
4351
4352     if (cflag) {
4353         for (i=0; i<256; i++) {
4354             trans[i] = 1;
4355         }
4356         while ((c = trnext(&trsrc, enc)) >= 0) {
4357             if (c < 256) {
4358                 trans[c] = -1;
4359             }
4360             else {
4361                 if (!hash) hash = rb_hash_new();
4362                 rb_hash_aset(hash, INT2NUM(c), Qtrue);
4363             }
4364         }
4365         while ((c = trnext(&trrepl, enc)) >= 0)
4366             /* retrieve last replacer */;
4367         last = trrepl.now;
4368         for (i=0; i<256; i++) {
4369             if (trans[i] >= 0) {
4370                 trans[i] = last;
4371             }
4372         }
4373     }
4374     else {
4375         int r;
4376
4377         for (i=0; i<256; i++) {
4378             trans[i] = -1;
4379         }
4380         while ((c = trnext(&trsrc, enc)) >= 0) {
4381             r = trnext(&trrepl, enc);
4382             if (r == -1) r = trrepl.now;
4383             if (c < 256) {
4384                 trans[c] = r;
4385                 if (r > 255) singlebyte = 0;
4386             }
4387             else {
4388                 if (!hash) hash = rb_hash_new();
4389                 rb_hash_aset(hash, INT2NUM(c), INT2NUM(r));
4390             }
4391         }
4392     }
4393
4394     rb_str_modify(str);
4395     s = RSTRING_PTR(str); send = RSTRING_END(str);
4396     if (sflag) {
4397         int clen, tlen, max = RSTRING_LEN(str);
4398         int offset, save = -1;
4399         char *buf = ALLOC_N(char, max), *t = buf;
4400
4401         while (s < send) {
4402             c0 = c = rb_enc_codepoint(s, send, enc);
4403             tlen = clen = rb_enc_codelen(c, enc);
4404
4405             s += clen;
4406             if (c < 256) {
4407                 c = trans[c];
4408             }
4409             else if (hash) {
4410                 VALUE tmp = rb_hash_lookup(hash, INT2NUM(c));
4411                 if (NIL_P(tmp)) {
4412                     if (cflag) c = last;
4413                     else c = -1;
4414                 }
4415                 else if (cflag) c = -1;
4416                 else c = NUM2INT(tmp);
4417             }
4418             else {
4419                 c = -1;
4420             }
4421             if (c >= 0) {
4422                 if (save == c) continue;
4423                 save = c;
4424                 tlen = rb_enc_codelen(c, enc);
4425                 modify = 1;
4426             }
4427             else {
4428                 save = -1;
4429                 c = c0;
4430             }
4431             while (t - buf + tlen >= max) {
4432                 offset = t - buf;
4433                 max *= 2;
4434                 REALLOC_N(buf, char, max);
4435                 t = buf + offset;
4436             }
4437             rb_enc_mbcput(c, t, enc);
4438             t += tlen;
4439         }
4440         *t = '\0';
4441         RSTRING(str)->as.heap.ptr = buf;
4442         RSTRING(str)->as.heap.len = t - buf;
4443         STR_SET_NOEMBED(str);
4444         RSTRING(str)->as.heap.aux.capa = max;
4445     }
4446     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
4447         while (s < send) {
4448             c = (unsigned char)*s;
4449             if (trans[c] >= 0) {
4450                 if (!cflag) {
4451                     c = trans[c];
4452                     *s = c;
4453                     modify = 1;
4454                 }
4455                 else {
4456                     *s = last;
4457                     modify = 1;
4458                 }
4459             }
4460             s++;
4461         }
4462     }
4463     else {
4464         int clen, tlen, max = RSTRING_LEN(str) * 1.2;
4465         int offset;
4466         char *buf = ALLOC_N(char, max), *t = buf;
4467
4468         while (s < send) {
4469             c0 = c = rb_enc_codepoint(s, send, enc);
4470             tlen = clen = rb_enc_codelen(c, enc);
4471
4472             if (c < 256) {
4473                 c = trans[c];
4474             }
4475             else if (hash) {
4476                 VALUE tmp = rb_hash_lookup(hash, INT2NUM(c));
4477                 if (NIL_P(tmp)) {
4478                     if (cflag) c = last;
4479                     else c = -1;
4480                 }
4481                 else if (cflag) c = -1;
4482                 else c = NUM2INT(tmp);
4483             }
4484             else {
4485                 c = -1;
4486             }
4487             if (c >= 0) {
4488                 tlen = rb_enc_codelen(c, enc);
4489                 modify = 1;
4490             }
4491             else {
4492                 modify = 1;
4493                 c = c0;
4494             }
4495             while (t - buf + tlen >= max) {
4496                 offset = t - buf;
4497                 max *= 2;
4498                 REALLOC_N(buf, char, max);
4499                 t = buf + offset;
4500             }
4501             if (s != t) rb_enc_mbcput(c, t, enc);
4502             s += clen;
4503             t += tlen;
4504         }
4505         if (!STR_EMBED_P(str)) {
4506             xfree(RSTRING(str)->as.heap.ptr);
4507         }
4508         *t = '\0';
4509         RSTRING(str)->as.heap.ptr = buf;
4510         RSTRING(str)->as.heap.len = t - buf;
4511         STR_SET_NOEMBED(str);
4512         RSTRING(str)->as.heap.aux.capa = max;
4513     }
4514
4515     if (modify) {
4516         rb_enc_associate(str, enc);
4517         return str;
4518     }
4519     return Qnil;
4520 }
4521
4522
4523 /*
4524  *  call-seq:
4525  *     str.tr!(from_str, to_str)   => str or nil
4526  *
4527  *  Translates <i>str</i> in place, using the same rules as
4528  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
4529  *  changes were made.
4530  */
4531
4532 static VALUE
4533 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
4534 {
4535     return tr_trans(str, src, repl, 0);
4536 }
4537
4538
4539 /*
4540  *  call-seq:
4541  *     str.tr(from_str, to_str)   => new_str
4542  *
4543  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i> replaced
4544  *  by the corresponding characters in <i>to_str</i>. If <i>to_str</i> is
4545  *  shorter than <i>from_str</i>, it is padded with its last character. Both
4546  *  strings may use the c1--c2 notation to denote ranges of characters, and
4547  *  <i>from_str</i> may start with a <code>^</code>, which denotes all
4548  *  characters except those listed.
4549  *
4550  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
4551  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
4552  *     "hello".tr('el', 'ip')      #=> "hippo"
4553  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
4554  */
4555
4556 static VALUE
4557 rb_str_tr(VALUE str, VALUE src, VALUE repl)
4558 {
4559     str = rb_str_dup(str);
4560     tr_trans(str, src, repl, 0);
4561     return str;
4562 }
4563
4564 static void
4565 tr_setup_table(VALUE str, char stable[256], int first,
4566                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
4567 {
4568     char buf[256];
4569     struct tr tr;
4570     int c, l;
4571     VALUE table = 0, ptable = 0;
4572     int i, cflag = 0;
4573
4574     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
4575     tr.gen = tr.now = tr.max = 0;
4576
4577     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
4578         cflag = 1;
4579         tr.p += l;
4580     }
4581     if (first) {
4582         for (i=0; i<256; i++) {
4583             stable[i] = 1;
4584         }
4585     }
4586     for (i=0; i<256; i++) {
4587         buf[i] = cflag;
4588     }
4589
4590     while ((c = trnext(&tr, enc)) >= 0) {
4591         if (c < 256) {
4592             buf[c & 0xff] = !cflag;
4593         }
4594         else {
4595             VALUE key = INT2NUM(c);
4596
4597             if (!table) {
4598                 table = rb_hash_new();
4599                 if (cflag) {
4600                     ptable = *ctablep;
4601                     *ctablep = table;
4602                 }
4603                 else {
4604                     ptable = *tablep;
4605                     *tablep = table;
4606                 }
4607             }
4608             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
4609                 rb_hash_aset(table, key, Qtrue);
4610             }
4611         }
4612     }
4613     for (i=0; i<256; i++) {
4614         stable[i] = stable[i] && buf[i];
4615     }
4616 }
4617
4618
4619 static int
4620 tr_find(int c, char table[256], VALUE del, VALUE nodel)
4621 {
4622     if (c < 256) {
4623         return table[c] ? Qtrue : Qfalse;
4624     }
4625     else {
4626         VALUE v = INT2NUM(c);
4627
4628         if (del && !NIL_P(rb_hash_lookup(del, v))) {
4629             if (!nodel || NIL_P(rb_hash_lookup(nodel, v))) {
4630                 return Qtrue;
4631             }
4632         }
4633         return Qfalse;
4634     }
4635 }
4636
4637 /*
4638  *  call-seq:
4639  *     str.delete!([other_str]+)   => str or nil
4640  *
4641  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
4642  *  <code>nil</code> if <i>str</i> was not modified.
4643  */
4644
4645 static VALUE
4646 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
4647 {
4648     char squeez[256];
4649     rb_encoding *enc = 0;
4650     char *s, *send, *t;
4651     VALUE del = 0, nodel = 0;
4652     int modify = 0;
4653     int i;
4654     int cr;
4655
4656     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4657     cr = ENC_CODERANGE(str);
4658     if (argc < 1) {
4659         rb_raise(rb_eArgError, "wrong number of arguments");
4660     }
4661     for (i=0; i<argc; i++) {
4662         VALUE s = argv[i];
4663
4664         StringValue(s);
4665         enc = rb_enc_check(str, s);
4666         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
4667     }
4668
4669     rb_str_modify(str);
4670     s = t = RSTRING_PTR(str);
4671     if (!s || RSTRING_LEN(str) == 0) return Qnil;
4672     send = RSTRING_END(str);
4673     while (s < send) {
4674         int c = rb_enc_codepoint(s, send, enc);
4675         int clen = rb_enc_codelen(c, enc);
4676
4677         if (tr_find(c, squeez, del, nodel)) {
4678             modify = 1;
4679         }
4680         else {
4681             if (t != s) rb_enc_mbcput(c, t, enc);
4682             t += clen;
4683         }
4684         s += clen;
4685     }
4686     *t = '\0';
4687     STR_SET_LEN(str, t - RSTRING_PTR(str));
4688
4689     ENC_CODERANGE_SET(str, cr);
4690     if (modify) return str;
4691     return Qnil;
4692 }
4693
4694
4695 /*
4696  *  call-seq:
4697  *     str.delete([other_str]+)   => new_str
4698  *
4699  *  Returns a copy of <i>str</i> with all characters in the intersection of its
4700  *  arguments deleted. Uses the same rules for building the set of characters as
4701  *  <code>String#count</code>.
4702  *
4703  *     "hello".delete "l","lo"        #=> "heo"
4704  *     "hello".delete "lo"            #=> "he"
4705  *     "hello".delete "aeiou", "^e"   #=> "hell"
4706  *     "hello".delete "ej-m"          #=> "ho"
4707  */
4708
4709 static VALUE
4710 rb_str_delete(int argc, VALUE *argv, VALUE str)
4711 {
4712     str = rb_str_dup(str);
4713     rb_str_delete_bang(argc, argv, str);
4714     return str;
4715 }
4716
4717
4718 /*
4719  *  call-seq:
4720  *     str.squeeze!([other_str]*)   => str or nil
4721  *
4722  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
4723  *  <code>nil</code> if no changes were made.
4724  */
4725
4726 static VALUE
4727 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
4728 {
4729     char squeez[256];
4730     rb_encoding *enc = 0;
4731     VALUE del = 0, nodel = 0;
4732     char *s, *send, *t;
4733     int save, modify = 0;
4734     int i;
4735
4736     if (argc == 0) {
4737         enc = STR_ENC_GET(str);
4738     }
4739     else {
4740         for (i=0; i<argc; i++) {
4741             VALUE s = argv[i];
4742
4743             StringValue(s);
4744             enc = rb_enc_check(str, s);
4745             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
4746         }
4747     }
4748
4749     rb_str_modify(str);
4750     s = t = RSTRING_PTR(str);
4751     if (!s || RSTRING_LEN(str) == 0) return Qnil;
4752     send = RSTRING_END(str);
4753     save = -1;
4754     while (s < send) {
4755         int c = rb_enc_codepoint(s, send, enc);
4756         int clen = rb_enc_codelen(c, enc);
4757
4758         if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
4759             if (t != s) rb_enc_mbcput(c, t, enc);
4760             save = c;
4761             t += clen;
4762         }
4763         s += clen;
4764     }
4765     *t = '\0';
4766     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
4767         STR_SET_LEN(str, t - RSTRING_PTR(str));
4768         modify = 1;
4769     }
4770
4771     if (modify) return str;
4772     return Qnil;
4773 }
4774
4775
4776 /*
4777  *  call-seq:
4778  *     str.squeeze([other_str]*)    => new_str
4779  *
4780  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
4781  *  procedure described for <code>String#count</code>. Returns a new string
4782  *  where runs of the same character that occur in this set are replaced by a
4783  *  single character. If no arguments are given, all runs of identical
4784  *  characters are replaced by a single character.
4785  *
4786  *     "yellow moon".squeeze                  #=> "yelow mon"
4787  *     "  now   is  the".squeeze(" ")         #=> " now is the"
4788  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
4789  */
4790
4791 static VALUE
4792 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
4793 {
4794     str = rb_str_dup(str);
4795     rb_str_squeeze_bang(argc, argv, str);
4796     return str;
4797 }
4798
4799
4800 /*
4801  *  call-seq:
4802  *     str.tr_s!(from_str, to_str)   => str or nil
4803  *
4804  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
4805  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
4806  */
4807
4808 static VALUE
4809 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
4810 {
4811     return tr_trans(str, src, repl, 1);
4812 }
4813
4814
4815 /*
4816  *  call-seq:
4817  *     str.tr_s(from_str, to_str)   => new_str
4818  *
4819  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
4820  *  then removes duplicate characters in regions that were affected by the
4821  *  translation.
4822  *
4823  *     "hello".tr_s('l', 'r')     #=> "hero"
4824  *     "hello".tr_s('el', '*')    #=> "h*o"
4825  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
4826  */
4827
4828 static VALUE
4829 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
4830 {
4831     str = rb_str_dup(str);
4832     tr_trans(str, src, repl, 1);
4833     return str;
4834 }
4835
4836
4837 /*
4838  *  call-seq:
4839  *     str.count([other_str]+)   => fixnum
4840  *
4841  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
4842  *  intersection of these sets defines the characters to count in
4843  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
4844  *  negated. The sequence c1--c2 means all characters between c1 and c2.
4845  *
4846  *     a = "hello world"
4847  *     a.count "lo"            #=> 5
4848  *     a.count "lo", "o"       #=> 2
4849  *     a.count "hello", "^l"   #=> 4
4850  *     a.count "ej-m"          #=> 4
4851  */
4852
4853 static VALUE
4854 rb_str_count(int argc, VALUE *argv, VALUE str)
4855 {
4856     char table[256];
4857     rb_encoding *enc = 0;
4858     VALUE del = 0, nodel = 0;
4859     char *s, *send;
4860     int i;
4861
4862     if (argc < 1) {
4863         rb_raise(rb_eArgError, "wrong number of arguments");
4864     }
4865     for (i=0; i<argc; i++) {
4866         VALUE s = argv[i];
4867
4868         StringValue(s);
4869         enc = rb_enc_check(str, s);
4870         tr_setup_table(s, table,i==0, &del, &nodel, enc);
4871     }
4872
4873     s = RSTRING_PTR(str);
4874     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
4875     send = RSTRING_END(str);
4876     i = 0;
4877     while (s < send) {
4878         int c = rb_enc_codepoint(s, send, enc);
4879         int clen = rb_enc_codelen(c, enc);
4880
4881         if (tr_find(c, table, del, nodel)) {
4882             i++;
4883         }
4884         s += clen;
4885     }
4886     return INT2NUM(i);
4887 }
4888
4889
4890 /*
4891  *  call-seq:
4892  *     str.split(pattern=$;, [limit])   => anArray
4893  *
4894  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
4895  *  of these substrings.
4896  *
4897  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
4898  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
4899  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
4900  *  of contiguous whitespace characters ignored.
4901  *
4902  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
4903  *  pattern matches. Whenever the pattern matches a zero-length string,
4904  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
4905  *  groups, the respective matches will be returned in the array as well.
4906  *
4907  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
4908  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
4909  *  split on whitespace as if ` ' were specified.
4910  *
4911  *  If the <i>limit</i> parameter is omitted, trailing null fields are
4912  *  suppressed. If <i>limit</i> is a positive number, at most that number of
4913  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
4914  *  string is returned as the only entry in an array). If negative, there is no
4915  *  limit to the number of fields returned, and trailing null fields are not
4916  *  suppressed.
4917  *
4918  *     " now's  the time".split        #=> ["now's", "the", "time"]
4919  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
4920  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
4921  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
4922  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
4923  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
4924  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
4925  *
4926  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
4927  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
4928  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
4929  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
4930  */
4931
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    /*
     * Implementation of the split method: argv holds the optional
     * pattern and the optional limit.  Returns a new array of pieces.
     */
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    int awk_split = Qfalse;     /* set when splitting on runs of whitespace */
    long beg, end, i = 0;       /* i counts fields when a limit is active */
    int lim = 0;
    VALUE result, tmp;

    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
        lim = NUM2INT(limit);
        /* zero or negative: no cap on the number of fields */
        if (lim <= 0) limit = Qnil;
        else if (lim == 1) {
            /* a limit of one never splits: whole string, or nothing if empty */
            if (RSTRING_LEN(str) == 0)
                return rb_ary_new2(0);
            return rb_ary_new3(1, str);
        }
        i = 1;
    }

    enc = STR_ENC_GET(str);
    if (NIL_P(spat)) {
        /* no pattern: use rb_fs when it is set, else whitespace splitting */
        if (!NIL_P(rb_fs)) {
            spat = rb_fs;
            goto fs_set;
        }
        awk_split = Qtrue;
    }
    else {
      fs_set:
        if (TYPE(spat) == T_STRING) {
            rb_encoding *enc2 = STR_ENC_GET(spat);

            /* a pattern consisting of exactly one space selects
             * whitespace splitting */
            if (rb_enc_mbminlen(enc2) == 1) {
                if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
                    awk_split = Qtrue;
                }
            }
            else {
                int l;
                if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
                    RSTRING_LEN(spat) == l) {
                    awk_split = Qtrue;
                }
            }
            /* any other string is matched literally via a quoted regexp */
            if (!awk_split) {
                spat = rb_reg_regcomp(rb_reg_quote(spat));
            }
        }
        else {
            spat = get_pat(spat, 1);
        }
    }

    result = rb_ary_new();
    beg = 0;
    if (awk_split) {
        char *ptr = RSTRING_PTR(str);
        char *eptr = RSTRING_END(str);
        char *bptr = ptr;
        int skip = 1;           /* non-zero while passing over whitespace */
        int c;

        end = beg;
        while (ptr < eptr) {
            c = rb_enc_codepoint(ptr, eptr, enc);
            ptr += rb_enc_mbclen(ptr, eptr, enc);
            if (skip) {
                if (rb_enc_isspace(c, enc)) {
                    /* still in whitespace: the field starts after it */
                    beg = ptr - bptr;
                }
                else {
                    end = ptr - bptr;
                    skip = 0;
                    /* limit reached: stop here, the remainder is pushed
                     * after the loop */
                    if (!NIL_P(limit) && lim <= i) break;
                }
            }
            else {
                if (rb_enc_isspace(c, enc)) {
                    /* whitespace ends the current field */
                    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
    }
    else {
        long start = beg;
        long idx;
        int last_null = 0;      /* set after stepping past a zero-length match */
        struct re_registers *regs;

        while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
            regs = RMATCH_REGS(rb_backref_get());
            if (start == end && BEG(0) == END(0)) {
                /* zero-length match at the search position */
                if (!RSTRING_PTR(str)) {
                    rb_ary_push(result, rb_str_new("", 0));
                    break;
                }
                else if (last_null == 1) {
                    /* emit the single character starting at beg */
                    rb_ary_push(result, rb_str_subseq(str, beg,
                                                      rb_enc_mbclen(RSTRING_PTR(str)+beg,
                                                                    RSTRING_END(str),
                                                                    enc)));
                    beg = start;
                }
                else {
                    /* advance the search position by one character (or one
                     * byte at the very end) and search again */
                    if (RSTRING_PTR(str)+start == RSTRING_END(str))
                        start++;
                    else
                        start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
                    last_null = 1;
                    continue;
                }
            }
            else {
                /* ordinary match: push the text before it, resume after it */
                rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                beg = start = END(0);
            }
            last_null = 0;

            /* captured groups that took part in the match are pushed too */
            for (idx=1; idx < regs->num_regs; idx++) {
                if (BEG(idx) == -1) continue;
                if (BEG(idx) == END(idx))
                    tmp = rb_str_new5(str, 0, 0);
                else
                    tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
                rb_ary_push(result, tmp);
            }
            if (!NIL_P(limit) && lim <= ++i) break;
        }
    }
    /* push whatever follows the last split point */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
        if (RSTRING_LEN(str) == beg)
            tmp = rb_str_new5(str, 0, 0);
        else
            tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
        rb_ary_push(result, tmp);
    }
    /* limit omitted or zero: drop trailing empty fields */
    if (NIL_P(limit) && lim == 0) {
        while (RARRAY_LEN(result) > 0 &&
               RSTRING_LEN(RARRAY_PTR(result)[RARRAY_LEN(result)-1]) == 0)
            rb_ary_pop(result);
    }

    return result;
}
5085
5086 VALUE
5087 rb_str_split(VALUE str, const char *sep0)
5088 {
5089     VALUE sep;
5090
5091     StringValue(str);
5092     sep = rb_str_new2(sep0);
5093     return rb_str_split_m(1, &sep, str);
5094 }
5095
5096
5097 /*
5098  *  Document-method: lines
5099  *  call-seq:
5100  *     str.lines(separator=$/)   => anEnumerator
5101  *     str.lines(separator=$/) {|substr| block }        => str
5102  *
5103  *  Returns an enumerator that gives each line in the string.  If a block is
5104  *  given, it iterates over each line in the string.
5105  *
5106  *     "foo\nbar\n".lines.to_a   #=> ["foo\n", "bar\n"]
5107  *     "foo\nb ar".lines.sort    #=> ["b ar", "foo\n"]
5108  */
5109
5110 /*
5111  *  Document-method: each_line
5112  *  call-seq:
5113  *     str.each_line(separator=$/) {|substr| block }   => str
5114  *
5115  *  Splits <i>str</i> using the supplied parameter as the record separator
5116  *  (<code>$/</code> by default), passing each substring in turn to the supplied
5117  *  block. If a zero-length record separator is supplied, the string is split
5118  *  into paragraphs delimited by multiple successive newlines.
5119  *
5120  *     print "Example one\n"
5121  *     "hello\nworld".each_line {|s| p s}
5122  *     print "Example two\n"
5123  *     "hello\nworld".each_line('l') {|s| p s}
5124  *     print "Example three\n"
5125  *     "hello\n\n\nworld".each_line('') {|s| p s}
5126  *
5127  *  <em>produces:</em>
5128  *
5129  *     Example one
5130  *     "hello\n"
5131  *     "world"
5132  *     Example two
5133  *     "hel"
5134  *     "l"
5135  *     "o\nworl"
5136  *     "d"
5137  *     Example three
5138  *     "hello\n\n\n"
5139  *     "world"
5140  */
5141
5142 static VALUE
5143 rb_str_each_line(int argc, VALUE *argv, VALUE str)
5144 {
5145     rb_encoding *enc;
5146     VALUE rs;
5147     int newline;
5148     char *p, *pend, *s, *ptr;
5149     long len, rslen;
5150     VALUE line;
5151     int n;
5152     VALUE orig = str;
5153
5154     if (argc == 0) {
5155         rs = rb_rs;
5156     }
5157     else {
5158         rb_scan_args(argc, argv, "01", &rs);
5159     }
5160     RETURN_ENUMERATOR(str, argc, argv);
5161     if (NIL_P(rs)) {
5162         rb_yield(str);
5163         return orig;
5164     }
5165     str = rb_str_new4(str);
5166     ptr = p = s = RSTRING_PTR(str);
5167     pend = p + RSTRING_LEN(str);
5168     len = RSTRING_LEN(str);
5169     StringValue(rs);
5170     if (rs == rb_default_rs) {
5171         enc = rb_enc_get(str);
5172         while (p < pend) {
5173             char *p0;
5174
5175             p = memchr(p, '\n', pend - p);
5176             if (!p) break;
5177             p0 = rb_enc_left_char_head(s, p, enc);
5178             if (!rb_enc_is_newline(p0, pend, enc)) {
5179                 p++;
5180                 continue;
5181             }
5182             p = p0 + rb_enc_mbclen(p0, pend, enc);
5183             line = rb_str_new5(str, s, p - s);
5184             OBJ_INFECT(line, str);
5185             rb_enc_cr_str_copy_for_substr(line, str);
5186             rb_yield(line);
5187             str_mod_check(str, ptr, len);
5188             s = p;
5189         }
5190         goto finish;
5191     }
5192
5193     enc = rb_enc_check(str, rs);
5194     rslen = RSTRING_LEN(rs);
5195     if (rslen == 0) {
5196         newline = '\n';
5197     }
5198     else {
5199         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
5200     }
5201
5202     while (p < pend) {
5203         int c = rb_enc_codepoint(p, pend, enc);
5204
5205       again:
5206         n = rb_enc_codelen(c, enc);
5207         if (rslen == 0 && c == newline) {
5208             p += n;
5209             if (p < pend && (c = rb_enc_codepoint(p, pend, enc)) != newline) {
5210                 goto again;
5211             }
5212             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
5213                 p += n;
5214             }
5215             p -= n;
5216         }
5217         if (c == newline &&
5218             (rslen <= 1 || memcmp(RSTRING_PTR(rs), p, rslen) == 0)) {
5219             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
5220             OBJ_INFECT(line, str);
5221             rb_enc_cr_str_copy_for_substr(line, str);
5222             rb_yield(line);
5223             str_mod_check(str, ptr, len);
5224             s = p + (rslen ? rslen : n);
5225         }
5226         p += n;
5227     }
5228
5229   finish:
5230     if (s != pend) {
5231         line = rb_str_new5(str, s, pend - s);
5232         OBJ_INFECT(line, str);
5233         rb_enc_cr_str_copy_for_substr(line, str);
5234         rb_yield(line);
5235     }
5236
5237     return orig;
5238 }
5239
5240
5241 /*
5242  *  Document-method: bytes
5243  *  call-seq:
5244  *     str.bytes   => anEnumerator
5245  *     str.bytes {|fixnum| block }    => str
5246  *
5247  *  Returns an enumerator that gives each byte in the string.  If a block is
5248  *  given, it iterates over each byte in the string.
5249  *
5250  *     "hello".bytes.to_a        #=> [104, 101, 108, 108, 111]
5251  */
5252
5253 /*
5254  *  Document-method: each_byte
5255  *  call-seq:
5256  *     str.each_byte {|fixnum| block }    => str
5257  *
5258  *  Passes each byte in <i>str</i> to the given block.
5259  *
5260  *     "hello".each_byte {|c| print c, ' ' }
5261  *
5262  *  <em>produces:</em>
5263  *
5264  *     104 101 108 108 111
5265  */
5266
5267 static VALUE
5268 rb_str_each_byte(VALUE str)
5269 {
5270     long i;
5271
5272     RETURN_ENUMERATOR(str, 0, 0);
5273     for (i=0; i<RSTRING_LEN(str); i++) {
5274         rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
5275     }
5276     return str;
5277 }
5278
5279
5280 /*
5281  *  Document-method: chars
5282  *  call-seq:
5283  *     str.chars                   => anEnumerator
5284  *     str.chars {|substr| block } => str
5285  *
5286  *  Returns an enumerator that gives each character in the string.
5287  *  If a block is given, it iterates over each character in the string.
5288  *
5289  *     "foo".chars.to_a   #=> ["f","o","o"]
5290  */
5291
5292 /*
5293  *  Document-method: each_char
5294  *  call-seq:
5295  *     str.each_char {|cstr| block }    => str
5296  *
5297  *  Passes each character in <i>str</i> to the given block.
5298  *
5299  *     "hello".each_char {|c| print c, ' ' }
5300  *
5301  *  <em>produces:</em>
5302  *
5303  *     h e l l o
5304  */
5305
5306 static VALUE
5307 rb_str_each_char(VALUE str)
5308 {
5309     int i, len, n;
5310     const char *ptr;
5311     rb_encoding *enc;
5312
5313     RETURN_ENUMERATOR(str, 0, 0);
5314     str = rb_str_new4(str);
5315     ptr = RSTRING_PTR(str);
5316     len = RSTRING_LEN(str);
5317     enc = rb_enc_get(str);
5318     for (i = 0; i < len; i += n) {
5319         n = rb_enc_mbclen(ptr + i, ptr + len, enc);
5320         rb_yield(rb_str_subseq(str, i, n));
5321     }
5322     return str;
5323 }
5324
5325 static long
5326 chopped_length(VALUE str)
5327 {
5328     rb_encoding *enc = STR_ENC_GET(str);
5329     const char *p, *p2, *beg, *end;
5330
5331     beg = RSTRING_PTR(str);
5332     end = beg + RSTRING_LEN(str);
5333     if (beg > end) return 0;
5334     p = rb_enc_prev_char(beg, end, enc);
5335     if (!p) return 0;
5336     if (p > beg && rb_enc_codepoint(p, end, enc) == '\n') {
5337         p2 = rb_enc_prev_char(beg, p, enc);
5338         if (p2 && rb_enc_codepoint(p2, end, enc) == '\r') p = p2;
5339     }
5340     return p - beg;
5341 }
5342
5343 /*
5344  *  call-seq:
5345  *     str.chop!   => str or nil
5346  *
5347  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
5348  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
5349  *  <code>String#chomp!</code>.
5350  */
5351
5352 static VALUE
5353 rb_str_chop_bang(VALUE str)
5354 {
5355     if (RSTRING_LEN(str) > 0) {
5356         long len;
5357         rb_str_modify(str);
5358         len = chopped_length(str);
5359         STR_SET_LEN(str, len);
5360         RSTRING_PTR(str)[len] = '\0';
5361         return str;
5362     }
5363     return Qnil;
5364 }
5365
5366
5367 /*
5368  *  call-seq:
5369  *     str.chop   => new_str
5370  *
5371  *  Returns a new <code>String</code> with the last character removed.  If the
5372  *  string ends with <code>\r\n</code>, both characters are removed. Applying
5373  *  <code>chop</code> to an empty string returns an empty
5374  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
5375  *  the string unchanged if it doesn't end in a record separator.
5376  *
5377  *     "string\r\n".chop   #=> "string"
5378  *     "string\n\r".chop   #=> "string\n"
5379  *     "string\n".chop     #=> "string"
5380  *     "string".chop       #=> "strin"
5381  *     "x".chop.chop       #=> ""
5382  */
5383
5384 static VALUE
5385 rb_str_chop(VALUE str)
5386 {
5387     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
5388     rb_enc_cr_str_copy_for_substr(str2, str);
5389     OBJ_INFECT(str2, str);
5390     return str2;
5391 }
5392
5393
5394 /*
5395  *  call-seq:
5396  *     str.chomp!(separator=$/)   => str or nil
5397  *
5398  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
5399  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
5400  */
5401
5402 static VALUE
5403 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
5404 {
5405     rb_encoding *enc;
5406     VALUE rs;
5407     int newline;
5408     char *p, *pp, *e;
5409     long len, rslen;
5410
5411     len = RSTRING_LEN(str);
5412     if (len == 0) return Qnil;
5413     p = RSTRING_PTR(str);
5414     e = p + len;
5415     if (argc == 0) {
5416         rs = rb_rs;
5417         if (rs == rb_default_rs) {
5418           smart_chomp:
5419             rb_str_modify(str);
5420             enc = rb_enc_get(str);
5421             if (rb_enc_mbminlen(enc) > 1) {
5422                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), enc);
5423                 if (rb_enc_is_newline(pp, e, enc)) {
5424                     e = pp;
5425                 }
5426                 pp = e - rb_enc_mbminlen(enc);
5427                 if (pp >= p) {
5428                     pp = rb_enc_left_char_head(p, pp, enc);
5429                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
5430                         e = pp;
5431                     }
5432                 }
5433                 if (e == RSTRING_END(str)) {
5434                     return Qnil;
5435                 }
5436                 len = e - RSTRING_PTR(str);
5437                 STR_SET_LEN(str, len);
5438             }
5439             else {
5440                 if (RSTRING_PTR(str)[len-1] == '\n') {
5441                     STR_DEC_LEN(str);
5442                     if (RSTRING_LEN(str) > 0 &&
5443                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
5444                         STR_DEC_LEN(str);
5445                     }
5446                 }
5447                 else if (RSTRING_PTR(str)[len-1] == '\r') {
5448                     STR_DEC_LEN(str);
5449                 }
5450                 else {
5451                     return Qnil;
5452                 }
5453             }
5454             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5455             return str;
5456         }
5457     }
5458     else {
5459         rb_scan_args(argc, argv, "01", &rs);
5460     }
5461     if (NIL_P(rs)) return Qnil;
5462     StringValue(rs);
5463     rslen = RSTRING_LEN(rs);
5464     if (rslen == 0) {
5465         while (len>0 && p[len-1] == '\n') {
5466             len--;
5467             if (len>0 && p[len-1] == '\r')
5468                 len--;
5469         }
5470         if (len < RSTRING_LEN(str)) {
5471             rb_str_modify(str);
5472             STR_SET_LEN(str, len);
5473             RSTRING_PTR(str)[len] = '\0';
5474             return str;
5475         }
5476         return Qnil;
5477     }
5478     if (rslen > len) return Qnil;
5479     newline = RSTRING_PTR(rs)[rslen-1];
5480     if (rslen == 1 && newline == '\n')
5481         goto smart_chomp;
5482
5483     enc = rb_enc_check(str, rs);
5484     if (is_broken_string(rs)) {
5485         return Qnil;
5486     }
5487     pp = e - rslen;
5488     if (p[len-1] == newline &&
5489         (rslen <= 1 ||
5490          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
5491         if (rb_enc_left_char_head(p, pp, enc) != pp)
5492             return Qnil;
5493         rb_str_modify(str);
5494         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
5495         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5496         return str;
5497     }
5498     return Qnil;
5499 }
5500
5501
5502 /*
5503  *  call-seq:
5504  *     str.chomp(separator=$/)   => new_str
5505  *
5506  *  Returns a new <code>String</code> with the given record separator removed
5507  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
5508  *  changed from the default Ruby record separator, then <code>chomp</code> also
5509  *  removes carriage return characters (that is it will remove <code>\n</code>,
5510  *  <code>\r</code>, and <code>\r\n</code>).
5511  *
5512  *     "hello".chomp            #=> "hello"
5513  *     "hello\n".chomp          #=> "hello"
5514  *     "hello\r\n".chomp        #=> "hello"
5515  *     "hello\n\r".chomp        #=> "hello\n"
5516  *     "hello\r".chomp          #=> "hello"
5517  *     "hello \n there".chomp   #=> "hello \n there"
5518  *     "hello".chomp("llo")     #=> "he"
5519  */
5520
5521 static VALUE
5522 rb_str_chomp(int argc, VALUE *argv, VALUE str)
5523 {
5524     str = rb_str_dup(str);
5525     rb_str_chomp_bang(argc, argv, str);
5526     return str;
5527 }
5528
5529 /*
5530  *  call-seq:
5531  *     str.lstrip!   => self or nil
5532  *
5533  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
5534  *  change was made. See also <code>String#rstrip!</code> and
5535  *  <code>String#strip!</code>.
5536  *
5537  *     "  hello  ".lstrip!  #=> "hello  "
5538  *     "hello".lstrip!      #=> nil
5539  */
5540
5541 static VALUE
5542 rb_str_lstrip_bang(VALUE str)
5543 {
5544     rb_encoding *enc;
5545     char *s, *t, *e;
5546
5547     rb_str_modify(str);
5548     enc = STR_ENC_GET(str);
5549     s = RSTRING_PTR(str);
5550     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5551     e = t = RSTRING_END(str);
5552     /* remove spaces at head */
5553     while (s < e) {
5554         int cc = rb_enc_codepoint(s, e, enc);
5555
5556         if (!rb_enc_isspace(cc, enc)) break;
5557         s += rb_enc_codelen(cc, enc);
5558     }
5559
5560     if (s > RSTRING_PTR(str)) {
5561         rb_str_modify(str);
5562         STR_SET_LEN(str, t-s);
5563         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
5564         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5565         return str;
5566     }
5567     return Qnil;
5568 }
5569
5570
5571 /*
5572  *  call-seq:
5573  *     str.lstrip   => new_str
5574  *
5575  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
5576  *  <code>String#rstrip</code> and <code>String#strip</code>.
5577  *
5578  *     "  hello  ".lstrip   #=> "hello  "
5579  *     "hello".lstrip       #=> "hello"
5580  */
5581
5582 static VALUE
5583 rb_str_lstrip(VALUE str)
5584 {
5585     str = rb_str_dup(str);
5586     rb_str_lstrip_bang(str);
5587     return str;
5588 }
5589
5590
5591 /*
5592  *  call-seq:
5593  *     str.rstrip!   => self or nil
5594  *
5595  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
5596  *  no change was made. See also <code>String#lstrip!</code> and
5597  *  <code>String#strip!</code>.
5598  *
5599  *     "  hello  ".rstrip!  #=> "  hello"
5600  *     "hello".rstrip!      #=> nil
5601  */
5602
5603 static VALUE
5604 rb_str_rstrip_bang(VALUE str)
5605 {
5606     rb_encoding *enc;
5607     char *s, *t, *e;
5608     int space_seen = Qfalse;
5609
5610     rb_str_modify(str);
5611     enc = STR_ENC_GET(str);
5612     s = RSTRING_PTR(str);
5613     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5614     t = e = RSTRING_END(str);
5615     while (s < e) {
5616         int cc = rb_enc_codepoint(s, e, enc);
5617
5618         if (!cc || rb_enc_isspace(cc, enc)) {
5619             if (!space_seen) t = s;
5620             space_seen = Qtrue;
5621         }
5622         else {
5623             space_seen = Qfalse;
5624         }
5625         s += rb_enc_codelen(cc, enc);
5626     }
5627     if (!space_seen) t = s;
5628     if (t < e) {
5629         rb_str_modify(str);
5630         STR_SET_LEN(str, t-RSTRING_PTR(str));
5631         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5632         return str;
5633     }
5634     return Qnil;
5635 }
5636
5637
5638 /*
5639  *  call-seq:
5640  *     str.rstrip   => new_str
5641  *
5642  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
5643  *  <code>String#lstrip</code> and <code>String#strip</code>.
5644  *
5645  *     "  hello  ".rstrip   #=> "  hello"
5646  *     "hello".rstrip       #=> "hello"
5647  */
5648
5649 static VALUE
5650 rb_str_rstrip(VALUE str)
5651 {
5652     str = rb_str_dup(str);
5653     rb_str_rstrip_bang(str);
5654     return str;
5655 }
5656
5657
5658 /*
5659  *  call-seq:
5660  *     str.strip!   => str or nil
5661  *
5662  *  Removes leading and trailing whitespace from <i>str</i>. Returns
5663  *  <code>nil</code> if <i>str</i> was not altered.
5664  */
5665
5666 static VALUE
5667 rb_str_strip_bang(VALUE str)
5668 {
5669     VALUE l = rb_str_lstrip_bang(str);
5670     VALUE r = rb_str_rstrip_bang(str);
5671
5672     if (NIL_P(l) && NIL_P(r)) return Qnil;
5673     return str;
5674 }
5675
5676
5677 /*
5678  *  call-seq:
5679  *     str.strip   => new_str
5680  *
5681  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
5682  *
5683  *     "    hello    ".strip   #=> "hello"
5684  *     "\tgoodbye\r\n".strip   #=> "goodbye"
5685  */
5686
5687 static VALUE
5688 rb_str_strip(VALUE str)
5689 {
5690     str = rb_str_dup(str);
5691     rb_str_strip_bang(str);
5692     return str;
5693 }
5694
5695 static VALUE
5696 scan_once(VALUE str, VALUE pat, long *start)
5697 {
5698     VALUE result, match;
5699     struct re_registers *regs;
5700     long i;
5701
5702     if (rb_reg_search(pat, str, *start, 0) >= 0) {
5703         match = rb_backref_get();
5704         regs = RMATCH_REGS(match);
5705         if (BEG(0) == END(0)) {
5706             rb_encoding *enc = STR_ENC_GET(str);
5707             /*
5708              * Always consume at least one character of the input string
5709              */
5710             if (RSTRING_LEN(str) > END(0))
5711                 *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
5712                                               RSTRING_END(str), enc);
5713             else
5714                 *start = END(0)+1;
5715         }
5716         else {
5717             *start = END(0);
5718         }
5719         if (regs->num_regs == 1) {
5720             return rb_reg_nth_match(0, match);
5721         }
5722         result = rb_ary_new2(regs->num_regs);
5723         for (i=1; i < regs->num_regs; i++) {
5724             rb_ary_push(result, rb_reg_nth_match(i, match));
5725         }
5726
5727         return result;
5728     }
5729     return Qnil;
5730 }
5731
5732
5733 /*
5734  *  call-seq:
5735  *     str.scan(pattern)                         => array
5736  *     str.scan(pattern) {|match, ...| block }   => str
5737  *
5738  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
5739  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
5740  *  generated and either added to the result array or passed to the block. If
5741  *  the pattern contains no groups, each individual result consists of the
5742  *  matched string, <code>$&</code>.  If the pattern contains groups, each
5743  *  individual result is itself an array containing one entry per group.
5744  *
5745  *     a = "cruel world"
5746  *     a.scan(/\w+/)        #=> ["cruel", "world"]
5747  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
5748  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
5749  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
5750  *
5751  *  And the block form:
5752  *
5753  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
5754  *     print "\n"
5755  *     a.scan(/(.)(.)/) {|x,y| print y, x }
5756  *     print "\n"
5757  *
5758  *  <em>produces:</em>
5759  *
5760  *     <<cruel>> <<world>>
5761  *     rceu lowlr
5762  */
5763
5764 static VALUE
5765 rb_str_scan(VALUE str, VALUE pat)
5766 {
5767     VALUE result;
5768     long start = 0;
5769     long last = -1, prev = 0;
5770     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
5771
5772     pat = get_pat(pat, 1);
5773     if (!rb_block_given_p()) {
5774         VALUE ary = rb_ary_new();
5775
5776         while (!NIL_P(result = scan_once(str, pat, &start))) {
5777             last = prev;
5778             prev = start;
5779             rb_ary_push(ary, result);
5780         }
5781         if (last >= 0) rb_reg_search(pat, str, last, 0);
5782         return ary;
5783     }
5784
5785     while (!NIL_P(result = scan_once(str, pat, &start))) {
5786         last = prev;
5787         prev = start;
5788         rb_yield(result);
5789         str_mod_check(str, p, len);
5790     }
5791     if (last >= 0) rb_reg_search(pat, str, last, 0);
5792     return str;
5793 }
5794
5795
5796 /*
5797  *  call-seq:
5798  *     str.hex   => integer
5799  *
5800  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
5801  *  (with an optional sign and an optional <code>0x</code>) and returns the
5802  *  corresponding number. Zero is returned on error.
5803  *
5804  *     "0x0a".hex     #=> 10
5805  *     "-1234".hex    #=> -4660
5806  *     "0".hex        #=> 0
5807  *     "wombat".hex   #=> 0
5808  */
5809
5810 static VALUE
5811 rb_str_hex(VALUE str)
5812 {
5813     rb_encoding *enc = rb_enc_get(str);
5814
5815     if (!rb_enc_asciicompat(enc)) {
5816         rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5817     }
5818     return rb_str_to_inum(str, 16, Qfalse);
5819 }
5820
5821
5822 /*
5823  *  call-seq:
5824  *     str.oct   => integer
5825  *
5826  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
5827  *  optional sign) and returns the corresponding number.  Returns 0 if the
5828  *  conversion fails.
5829  *
5830  *     "123".oct       #=> 83
5831  *     "-377".oct      #=> -255
5832  *     "bad".oct       #=> 0
5833  *     "0377bad".oct   #=> 255
5834  */
5835
5836 static VALUE
5837 rb_str_oct(VALUE str)
5838 {
5839     rb_encoding *enc = rb_enc_get(str);
5840
5841     if (!rb_enc_asciicompat(enc)) {
5842         rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5843     }
5844     return rb_str_to_inum(str, -8, Qfalse);
5845 }
5846
5847
5848 /*
5849  *  call-seq:
5850  *     str.crypt(other_str)   => new_str
5851  *
5852  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
5853  *  library function <code>crypt</code>. The argument is the salt string, which
5854  *  should be two characters long, each character drawn from
5855  *  <code>[a-zA-Z0-9./]</code>.
5856  */
5857
5858 static VALUE
5859 rb_str_crypt(VALUE str, VALUE salt)
5860 {
5861     extern char *crypt(const char *, const char *);
5862     VALUE result;
5863     const char *s;
5864
5865     StringValue(salt);
5866     if (RSTRING_LEN(salt) < 2)
5867         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
5868
5869     if (RSTRING_PTR(str)) s = RSTRING_PTR(str);
5870     else s = "";
5871     result = rb_str_new2(crypt(s, RSTRING_PTR(salt)));
5872     OBJ_INFECT(result, str);
5873     OBJ_INFECT(result, salt);
5874     return result;
5875 }
5876
5877
5878 /*
5879  *  call-seq:
5880  *     str.intern   => symbol
5881  *     str.to_sym   => symbol
5882  *
5883  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
5884  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
5885  *
5886  *     "Koala".intern         #=> :Koala
5887  *     s = 'cat'.to_sym       #=> :cat
5888  *     s == :cat              #=> true
5889  *     s = '@cat'.to_sym      #=> :@cat
5890  *     s == :@cat             #=> true
5891  *
5892  *  This can also be used to create symbols that cannot be represented using the
5893  *  <code>:xxx</code> notation.
5894  *
5895  *     'cat and dog'.to_sym   #=> :"cat and dog"
5896  */
5897
5898 VALUE
5899 rb_str_intern(VALUE s)
5900 {
5901     VALUE str = RB_GC_GUARD(s);
5902     VALUE sym;
5903     ID id, id2;
5904
5905     id = rb_intern_str(str);
5906     sym = ID2SYM(id);
5907     id2 = SYM2ID(sym);
5908     if (id != id2) {
5909         const char *name = rb_id2name(id2);
5910
5911         if (name) {
5912             rb_raise(rb_eRuntimeError, "symbol table overflow (%s given for %s)",
5913                      name, RSTRING_PTR(str));
5914         }
5915         else {
5916             rb_raise(rb_eRuntimeError, "symbol table overflow (symbol %s)",
5917                      RSTRING_PTR(str));
5918         }
5919     }
5920     return sym;
5921 }
5922
5923
5924 /*
5925  *  call-seq:
5926  *     str.ord   => integer
5927  *
5928  *  Return the <code>Integer</code> ordinal of a one-character string.
5929  *
5930  *     "a".ord         #=> 97
5931  */
5932
5933 VALUE
5934 rb_str_ord(VALUE s)
5935 {
5936     int c;
5937
5938     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
5939     return INT2NUM(c);
5940 }
5941 /*
5942  *  call-seq:
5943  *     str.sum(n=16)   => integer
5944  *
5945  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
5946  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
5947  *  to 16. The result is the sum of the binary value of each byte in <i>str</i>,
5948  *  reduced to its low <em>n</em> bits (<code>sum & (2**n - 1)</code>). This is
5949  *  not a particularly good checksum.
5950  */
5951
5952 static VALUE
5953 rb_str_sum(int argc, VALUE *argv, VALUE str)
5954 {
5955     VALUE vbits;
5956     int bits;
5957     char *ptr, *p, *pend;
5958     long len;
5959
5960     if (argc == 0) {
5961         bits = 16;
5962     }
5963     else {
5964         rb_scan_args(argc, argv, "01", &vbits);
5965         bits = NUM2INT(vbits);
5966     }
5967     ptr = p = RSTRING_PTR(str);
5968     len = RSTRING_LEN(str);
5969     pend = p + len;
5970     if (bits >= sizeof(long)*CHAR_BIT) {
5971         VALUE sum = INT2FIX(0);
5972
5973         while (p < pend) {
5974             str_mod_check(str, ptr, len);
5975             sum = rb_funcall(sum, '+', 1, INT2FIX((unsigned char)*p));
5976             p++;
5977         }
5978         if (bits != 0) {
5979             VALUE mod;
5980
5981             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
5982             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
5983             sum = rb_funcall(sum, '&', 1, mod);
5984         }
5985         return sum;
5986     }
5987     else {
5988        unsigned long sum = 0;
5989
5990         while (p < pend) {
5991             str_mod_check(str, ptr, len);
5992             sum += (unsigned char)*p;
5993             p++;
5994         }
5995         if (bits != 0) {
5996            sum &= (((unsigned long)1)<<bits)-1;
5997         }
5998         return rb_int2inum(sum);
5999     }
6000 }
6001
6002 static VALUE
6003 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
6004 {
6005     rb_encoding *enc;
6006     VALUE w;
6007     long width, len, flen = 1, fclen = 1;
6008     VALUE res;
6009     char *p;
6010     const char *f = " ";
6011     long n, llen, rlen;
6012     volatile VALUE pad;
6013     int singlebyte = 1;
6014
6015     rb_scan_args(argc, argv, "11", &w, &pad);
6016     enc = STR_ENC_GET(str);
6017     width = NUM2LONG(w);
6018     if (argc == 2) {
6019         StringValue(pad);
6020         enc = rb_enc_check(str, pad);
6021         f = RSTRING_PTR(pad);
6022         flen = RSTRING_LEN(pad);
6023         fclen = str_strlen(pad, enc);
6024         singlebyte = single_byte_optimizable(pad);
6025         if (flen == 0 || fclen == 0) {
6026             rb_raise(rb_eArgError, "zero width padding");
6027         }
6028     }
6029     len = str_strlen(str, enc);
6030     if (width < 0 || len >= width) return rb_str_dup(str);
6031     n = width - len;
6032     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
6033     rlen = n - llen;
6034     res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2);
6035     p = RSTRING_PTR(res);
6036     while (llen) {
6037         if (flen <= 1) {
6038             *p++ = *f;
6039             llen--;
6040         }
6041         else if (llen > fclen) {
6042             memcpy(p,f,flen);
6043             p += flen;
6044             llen -= fclen;
6045         }
6046         else {
6047             char *fp = str_nth(f, f+flen, llen, enc, singlebyte);
6048             n = fp - f;
6049             memcpy(p,f,n);
6050             p+=n;
6051             break;
6052         }
6053     }
6054     memcpy(p, RSTRING_PTR(str), RSTRING_LEN(str));
6055     p+=RSTRING_LEN(str);
6056     while (rlen) {
6057         if (flen <= 1) {
6058             *p++ = *f;
6059             rlen--;
6060         }
6061         else if (rlen > fclen) {
6062             memcpy(p,f,flen);
6063             p += flen;
6064             rlen -= fclen;
6065         }
6066         else {
6067             char *fp = str_nth(f, f+flen, rlen, enc, singlebyte);
6068             n = fp - f;
6069             memcpy(p,f,n);
6070             p+=n;
6071             break;
6072         }
6073     }
6074     *p = '\0';
6075     STR_SET_LEN(res, p-RSTRING_PTR(res));
6076     OBJ_INFECT(res, str);
6077     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
6078     rb_enc_associate(res, enc);
6079     return res;
6080 }
6081
6082
6083 /*
6084  *  call-seq:
6085  *     str.ljust(integer, padstr=' ')   => new_str
6086  *
6087  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6088  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
6089  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6090  *
6091  *     "hello".ljust(4)            #=> "hello"
6092  *     "hello".ljust(20)           #=> "hello               "
6093  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
6094  */
6095
6096 static VALUE
6097 rb_str_ljust(int argc, VALUE *argv, VALUE str)
6098 {
6099     return rb_str_justify(argc, argv, str, 'l');
6100 }
6101
6102
6103 /*
6104  *  call-seq:
6105  *     str.rjust(integer, padstr=' ')   => new_str
6106  *
6107  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6108  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
6109  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6110  *
6111  *     "hello".rjust(4)            #=> "hello"
6112  *     "hello".rjust(20)           #=> "               hello"
6113  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
6114  */
6115
6116 static VALUE
6117 rb_str_rjust(int argc, VALUE *argv, VALUE str)
6118 {
6119     return rb_str_justify(argc, argv, str, 'r');
6120 }
6121
6122
6123 /*
6124  *  call-seq:
6125  *     str.center(integer, padstr=' ')   => new_str
6126  *
6127  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6128  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
6129  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6130  *
6131  *     "hello".center(4)         #=> "hello"
6132  *     "hello".center(20)        #=> "       hello        "
6133  *     "hello".center(20, '123') #=> "1231231hello12312312"
6134  */
6135
6136 static VALUE
6137 rb_str_center(int argc, VALUE *argv, VALUE str)
6138 {
6139     return rb_str_justify(argc, argv, str, 'c');
6140 }
6141
6142 /*
6143  *  call-seq:
6144  *     str.partition(sep)              => [head, sep, tail]
6145  *
6146  *  Searches the string for <i>sep</i> and returns the part before
6147  *  it, the <i>sep</i>, and the part after it.  If <i>sep</i> is not found,
6148  *  returns <i>str</i> and two empty strings.
6149  *
6150  *     "hello".partition("l")         #=> ["he", "l", "lo"]
6151  *     "hello".partition("x")         #=> ["hello", "", ""]
6152  */
6153
6154 static VALUE
6155 rb_str_partition(VALUE str, VALUE sep)
6156 {
6157     long pos;
6158     int regex = Qfalse;
6159
6160     if (TYPE(sep) == T_REGEXP) {
6161         pos = rb_reg_search(sep, str, 0, 0);
6162         regex = Qtrue;
6163     }
6164     else {
6165         VALUE tmp;
6166
6167         tmp = rb_check_string_type(sep);
6168         if (NIL_P(tmp)) {
6169             rb_raise(rb_eTypeError, "type mismatch: %s given",
6170                      rb_obj_classname(sep));
6171         }
6172         pos = rb_str_index(str, sep, 0);
6173     }
6174     if (pos < 0) {
6175       failed:
6176         return rb_ary_new3(3, str, rb_str_new(0,0),rb_str_new(0,0));
6177     }
6178     if (regex) {
6179         sep = rb_str_subpat(str, sep, 0);
6180         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
6181     }
6182     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
6183                           sep,
6184                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
6185                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
6186 }
6187
6188 /*
6189  *  call-seq:
6190  *     str.rpartition(sep)            => [head, sep, tail]
6191  *
6192  *  Searches <i>sep</i> in the string from the end of the string, and
6193  *  returns the part before it, the <i>sep</i>, and the part after it.
6194  *  If <i>sep</i> is not found, returns two empty strings and
6195  *  <i>str</i>.
6196  *
6197  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
6198  *     "hello".rpartition("x")         #=> ["", "", "hello"]
6199  */
6200
6201 static VALUE
6202 rb_str_rpartition(VALUE str, VALUE sep)
6203 {
6204     long pos = RSTRING_LEN(str);
6205     int regex = Qfalse;
6206
6207     if (TYPE(sep) == T_REGEXP) {
6208         pos = rb_reg_search(sep, str, pos, 1);
6209         regex = Qtrue;
6210     }
6211     else {
6212         VALUE tmp;
6213
6214         tmp = rb_check_string_type(sep);
6215         if (NIL_P(tmp)) {
6216             rb_raise(rb_eTypeError, "type mismatch: %s given",
6217                      rb_obj_classname(sep));
6218         }
6219         pos = rb_str_sublen(str, pos);
6220         pos = rb_str_rindex(str, sep, pos);
6221     }
6222     if (pos < 0) {
6223         return rb_ary_new3(3, rb_str_new(0,0),rb_str_new(0,0), str);
6224     }
6225     if (regex) {
6226         sep = rb_reg_nth_match(0, rb_backref_get());
6227     }
6228     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
6229                           sep,
6230                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
6231 }
6232
6233 /*
6234  *  call-seq:
6235  *     str.start_with?([prefix]+)   => true or false
6236  *
6237  *  Returns true if <i>str</i> starts with the prefix given.
6238  */
6239
6240 static VALUE
6241 rb_str_start_with(int argc, VALUE *argv, VALUE str)
6242 {
6243     int i;
6244
6245     for (i=0; i<argc; i++) {
6246         VALUE tmp = rb_check_string_type(argv[i]);
6247         if (NIL_P(tmp)) continue;
6248         rb_enc_check(str, tmp);
6249         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6250         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6251             return Qtrue;
6252     }
6253     return Qfalse;
6254 }
6255
6256 /*
6257  *  call-seq:
6258  *     str.end_with?([suffix]+)   => true or false
6259  *
6260  *  Returns true if <i>str</i> ends with the suffix given.
6261  */
6262
6263 static VALUE
6264 rb_str_end_with(int argc, VALUE *argv, VALUE str)
6265 {
6266     int i;
6267     char *p, *s;
6268     rb_encoding *enc;
6269
6270     for (i=0; i<argc; i++) {
6271         VALUE tmp = rb_check_string_type(argv[i]);
6272         if (NIL_P(tmp)) continue;
6273         enc = rb_enc_check(str, tmp);
6274         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6275         p = RSTRING_PTR(str);
6276         s = p + RSTRING_LEN(str) - RSTRING_LEN(tmp);
6277         if (rb_enc_left_char_head(p, s, enc) != s)
6278             continue;
6279         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6280             return Qtrue;
6281     }
6282     return Qfalse;
6283 }
6284
6285 void
6286 rb_str_setter(VALUE val, ID id, VALUE *var)
6287 {
6288     if (!NIL_P(val) && TYPE(val) != T_STRING) {
6289         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
6290     }
6291     *var = val;
6292 }
6293
6294
6295 /*
6296  *  call-seq:
6297  *     str.force_encoding(encoding)   => str
6298  *
6299  *  Changes the encoding to +encoding+ and returns self.
6300  */
6301
6302 static VALUE
6303 rb_str_force_encoding(VALUE str, VALUE enc)
6304 {
6305     str_modifiable(str);
6306     rb_enc_associate(str, rb_to_encoding(enc));
6307     return str;
6308 }
6309
6310 /*
6311  *  call-seq:
6312  *     str.valid_encoding?  => true or false
6313  *
6314  *  Returns true for a string which encoded correctly.
6315  *
6316  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding? => true
6317  *    "\xc2".force_encoding("UTF-8").valid_encoding? => false
6318  *    "\x80".force_encoding("UTF-8").valid_encoding? => false
6319  */
6320
6321 static VALUE
6322 rb_str_valid_encoding_p(VALUE str)
6323 {
6324     int cr = rb_enc_str_coderange(str);
6325
6326     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
6327 }
6328
6329 /*
6330  *  call-seq:
6331  *     str.ascii_only?  => true or false
6332  *
6333  *  Returns true for a string which has only ASCII characters.
6334  *
6335  *    "abc".force_encoding("UTF-8").ascii_only? => true
6336  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only? => false
6337  */
6338
6339 static VALUE
6340 rb_str_is_ascii_only_p(VALUE str)
6341 {
6342     int cr = rb_enc_str_coderange(str);
6343
6344     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
6345 }
6346
6347 /**********************************************************************
6348  * Document-class: Symbol
6349  *
6350  *  <code>Symbol</code> objects represent names and some strings
6351  *  inside the Ruby
6352  *  interpreter. They are generated using the <code>:name</code> and
6353  *  <code>:"string"</code> literal
6354  *  syntax, and by the various <code>to_sym</code> methods. The same
6355  *  <code>Symbol</code> object will be created for a given name or string
6356  *  for the duration of a program's execution, regardless of the context
6357  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
6358  *  one context, a method in another, and a class in a third, the
6359  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
6360  *  all three contexts.
6361  *
6362  *     module One
6363  *       class Fred
6364  *       end
6365  *       $f1 = :Fred
6366  *     end
6367  *     module Two
6368  *       Fred = 1
6369  *       $f2 = :Fred
6370  *     end
6371  *     def Fred()
6372  *     end
6373  *     $f3 = :Fred
6374  *     $f1.object_id   #=> 2514190
6375  *     $f2.object_id   #=> 2514190
6376  *     $f3.object_id   #=> 2514190
6377  *
6378  */
6379
6380
6381 /*
6382  *  call-seq:
6383  *     sym == obj   => true or false
6384  *
6385  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
6386  *  symbol, returns <code>true</code>. Otherwise, compares them
6387  *  as strings.
6388  */
6389
6390 static VALUE
6391 sym_equal(VALUE sym1, VALUE sym2)
6392 {
6393     if (sym1 == sym2) return Qtrue;
6394     return Qfalse;
6395 }
6396
6397
6398 /*
6399  *  call-seq:
6400  *     sym.inspect    => string
6401  *
6402  *  Returns the representation of <i>sym</i> as a symbol literal.
6403  *
6404  *     :fred.inspect   #=> ":fred"
6405  */
6406
6407 static VALUE
6408 sym_inspect(VALUE sym)
6409 {
6410     VALUE str;
6411     ID id = SYM2ID(sym);
6412     rb_encoding *enc;
6413
6414     sym = rb_id2str(id);
6415     enc = STR_ENC_GET(sym);
6416     str = rb_enc_str_new(0, RSTRING_LEN(sym)+1, enc);
6417     RSTRING_PTR(str)[0] = ':';
6418     memcpy(RSTRING_PTR(str)+1, RSTRING_PTR(sym), RSTRING_LEN(sym));
6419     if (RSTRING_LEN(sym) != strlen(RSTRING_PTR(sym)) ||
6420         !rb_enc_symname_p(RSTRING_PTR(sym), enc)) {
6421         str = rb_str_inspect(str);
6422         strncpy(RSTRING_PTR(str), ":\"", 2);
6423     }
6424     return str;
6425 }
6426
6427
6428 /*
6429  *  call-seq:
6430  *     sym.id2name   => string
6431  *     sym.to_s      => string
6432  *
6433  *  Returns the name or string corresponding to <i>sym</i>.
6434  *
6435  *     :fred.id2name   #=> "fred"
6436  */
6437
6438
6439 VALUE
6440 rb_sym_to_s(VALUE sym)
6441 {
6442     ID id = SYM2ID(sym);
6443
6444     return str_new3(rb_cString, rb_id2str(id));
6445 }
6446
6447
6448 /*
6449  * call-seq:
6450  *   sym.to_sym   => sym
6451  *   sym.intern   => sym
6452  *
6453  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
6454  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
6455  * in this case.
6456  */
6457
6458 static VALUE
6459 sym_to_sym(VALUE sym)
6460 {
6461     return sym;
6462 }
6463
6464 static VALUE
6465 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
6466 {
6467     VALUE obj;
6468
6469     if (argc < 1) {
6470         rb_raise(rb_eArgError, "no receiver given");
6471     }
6472     obj = argv[0];
6473     return rb_funcall3(obj, (ID)sym, argc - 1, argv + 1);
6474 }
6475
6476 /*
6477  * call-seq:
6478  *   sym.to_proc
6479  *
6480  * Returns a _Proc_ object which respond to the given method by _sym_.
6481  *
6482  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
6483  */
6484
6485 static VALUE
6486 sym_to_proc(VALUE sym)
6487 {
6488     return rb_proc_new(sym_call, (VALUE)SYM2ID(sym));
6489 }
6490
6491
6492 static VALUE
6493 sym_succ(VALUE sym)
6494 {
6495     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
6496 }
6497
6498 static VALUE
6499 sym_cmp(VALUE sym, VALUE other)
6500 {
6501     if (!SYMBOL_P(other)) {
6502         return Qnil;
6503     }
6504     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
6505 }
6506
6507 static VALUE
6508 sym_casecmp(VALUE sym, VALUE other)
6509 {
6510     if (!SYMBOL_P(other)) {
6511         return Qnil;
6512     }
6513     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
6514 }
6515
6516 static VALUE
6517 sym_match(VALUE sym, VALUE other)
6518 {
6519     return rb_str_match(rb_sym_to_s(sym), other);
6520 }
6521
6522 static VALUE
6523 sym_eqq(VALUE sym, VALUE other)
6524 {
6525     if (sym == other) return Qtrue;
6526     return rb_str_equal(rb_sym_to_s(sym), other);
6527 }
6528
6529 static VALUE
6530 sym_aref(int argc, VALUE *argv, VALUE sym)
6531 {
6532     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
6533 }
6534
6535 static VALUE
6536 sym_length(VALUE sym)
6537 {
6538     return rb_str_length(rb_id2str(SYM2ID(sym)));
6539 }
6540
6541 static VALUE
6542 sym_empty(VALUE sym)
6543 {
6544     return rb_str_empty(rb_id2str(SYM2ID(sym)));
6545 }
6546
6547 static VALUE
6548 sym_upcase(VALUE sym)
6549 {
6550     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
6551 }
6552
6553 static VALUE
6554 sym_downcase(VALUE sym)
6555 {
6556     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
6557 }
6558
6559 static VALUE
6560 sym_capitalize(VALUE sym)
6561 {
6562     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
6563 }
6564
6565 static VALUE
6566 sym_swapcase(VALUE sym)
6567 {
6568     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
6569 }
6570
6571 static VALUE
6572 sym_encoding(VALUE sym)
6573 {
6574     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
6575 }
6576
6577 ID
6578 rb_to_id(VALUE name)
6579 {
6580     VALUE tmp;
6581     ID id;
6582
6583     switch (TYPE(name)) {
6584       default:
6585         tmp = rb_check_string_type(name);
6586         if (NIL_P(tmp)) {
6587             rb_raise(rb_eTypeError, "%s is not a symbol",
6588                      RSTRING_PTR(rb_inspect(name)));
6589         }
6590         name = tmp;
6591         /* fall through */
6592       case T_STRING:
6593         name = rb_str_intern(name);
6594         /* fall through */
6595       case T_SYMBOL:
6596         return SYM2ID(name);
6597     }
6598     return id;
6599 }
6600
6601 /*
6602  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
6603  *  bytes, typically representing characters. String objects may be created
6604  *  using <code>String::new</code> or as literals.
6605  *
6606  *  Because of aliasing issues, users of strings should be aware of the methods
6607  *  that modify the contents of a <code>String</code> object.  Typically,
6608  *  methods with names ending in ``!'' modify their receiver, while those
6609  *  without a ``!'' return a new <code>String</code>.  However, there are
6610  *  exceptions, such as <code>String#[]=</code>.
6611  *
6612  */
6613
6614 void
6615 Init_String(void)
6616 {
6617 #undef rb_intern
6618     /* Defines the String and Symbol classes and registers their methods; rb_intern is #undef'd first, presumably so the call below uses the plain function -- TODO confirm in ruby.h. */
6619     rb_cString  = rb_define_class("String", rb_cObject);
6620     rb_include_module(rb_cString, rb_mComparable);
6621     rb_define_alloc_func(rb_cString, str_alloc);
6622     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
6623     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
6624     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
6625     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
6626     rb_define_method(rb_cString, "==", rb_str_equal, 1);
6627     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
6628     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
6629     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
6630     rb_define_method(rb_cString, "+", rb_str_plus, 1);
6631     rb_define_method(rb_cString, "*", rb_str_times, 1);
6632     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
6633     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
6634     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
6635     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
6636     rb_define_method(rb_cString, "length", rb_str_length, 0);
6637     rb_define_method(rb_cString, "size", rb_str_length, 0);
6638     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
6639     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
6640     rb_define_method(rb_cString, "=~", rb_str_match, 1);
6641     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
6642     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
6643     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
6644     rb_define_method(rb_cString, "next", rb_str_succ, 0);
6645     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
6646     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
6647     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
6648     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
6649     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
6650     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
6651     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
6652     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
6653     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
6654     /* String: conversion and inspection (to_s and to_str share one implementation). */
6655     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
6656     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
6657     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
6658     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
6659     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
6660     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
6661     /* String: case conversion returning a result. */
6662     rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
6663     rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
6664     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
6665     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
6666     /* String: the "!" counterparts of the case-conversion methods. */
6667     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
6668     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
6669     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
6670     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
6671     /* String: numeric parsing, splitting, iteration aliases, concatenation, interning. */
6672     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
6673     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
6674     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
6675     rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
6676     rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
6677     rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
6678     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
6679     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
6680     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
6681     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
6682     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
6683     rb_define_method(rb_cString, "intern", rb_str_intern, 0);
6684     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
6685     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
6686     /* String: substring, prefix and suffix predicates. */
6687     rb_define_method(rb_cString, "include?", rb_str_include, 1);
6688     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
6689     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
6690
6691     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
6692     /* String: justification; each takes a variable argument list (-1). */
6693     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
6694     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
6695     rb_define_method(rb_cString, "center", rb_str_center, -1);
6696     /* String: substitution and trimming returning a result. */
6697     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
6698     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
6699     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
6700     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
6701     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
6702     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
6703     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
6704     /* String: the "!" counterparts of the substitution and trimming methods. */
6705     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
6706     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
6707     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
6708     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
6709     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
6710     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
6711     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
6712     /* String: character-set operations returning a result. */
6713     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
6714     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
6715     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
6716     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
6717     rb_define_method(rb_cString, "count", rb_str_count, -1);
6718     /* String: the "!" counterparts of the character-set operations. */
6719     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
6720     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
6721     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
6722     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
6723     /* String: iterators; "lines", "bytes" and "chars" above bind the same functions. */
6724     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
6725     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
6726     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
6727
6728     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
6729     /* String: "slice" shares its implementation with "[]". */
6730     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
6731     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
6732
6733     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
6734     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
6735     /* String: encoding queries and changes. */
6736     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
6737     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
6738     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
6739     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
6740     /* Cache the ID of "to_s" in the file-level id_to_s. */
6741     id_to_s = rb_intern("to_s");
6742     /* rb_fs starts as nil and is exposed under two global names, $; and $-F. */
6743     rb_fs = Qnil;
6744     rb_define_variable("$;", &rb_fs);
6745     rb_define_variable("$-F", &rb_fs);
6746     /* Symbol: Comparable is included; the allocator and the class-level "new" are removed. */
6747     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
6748     rb_include_module(rb_cSymbol, rb_mComparable);
6749     rb_undef_alloc_func(rb_cSymbol);
6750     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
6751     rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
6752     /* Symbol: identity, conversion and succession. */
6753     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
6754     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
6755     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
6756     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
6757     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
6758     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
6759     rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
6760     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
6761     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
6762     /* Symbol: comparison and matching. */
6763     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
6764     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
6765     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
6766     rb_define_method(rb_cSymbol, "===", sym_eqq, 1);
6767     /* Symbol: element access and size; "match" binds the same function as "=~". */
6768     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
6769     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
6770     rb_define_method(rb_cSymbol, "length", sym_length, 0);
6771     rb_define_method(rb_cSymbol, "size", sym_length, 0);
6772     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
6773     rb_define_method(rb_cSymbol, "match", sym_match, 1);
6774     /* Symbol: case conversion. */
6775     rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
6776     rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
6777     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
6778     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
6779
6780     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
6781 }