string.c

   1 /**********************************************************************
   2
   3   string.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 17:12:58 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
  10   Copyright (C) 2000  Information-technology Promotion Agency, Japan
  11
  12 **********************************************************************/
  13
  14 #include "ruby/ruby.h"
  15 #include "ruby/re.h"
  16 #include "ruby/encoding.h"
  17
/* Shorthand for entry `no` of regs->beg / regs->end.  Both macros expand to
 * an expression on a variable literally named `regs`, which must therefore be
 * in scope wherever they are used (presumably match registers -- confirm at
 * the use sites). */
#define BEG(no) regs->beg[no]
#define END(no) regs->end[no]
  20
  21 #include <math.h>
  22 #include <ctype.h>
  23
  24 #ifdef HAVE_UNISTD_H
  25 #include <unistd.h>
  26 #endif
  27
/* Drop any macro definitions of these names (presumably supplied by the
 * headers included above) so that the identifiers can be defined as real
 * functions / aliases further down in this file. */
#undef rb_str_new_cstr
#undef rb_tainted_str_new_cstr
#undef rb_usascii_str_new_cstr
#undef rb_str_new2
#undef rb_str_new3
#undef rb_str_new4
#undef rb_str_new5
#undef rb_tainted_str_new2
#undef rb_usascii_str_new2
#undef rb_str_dup_frozen
#undef rb_str_buf_new_cstr
#undef rb_str_buf_new2
#undef rb_str_buf_cat2
#undef rb_str_cat2
  42
/* Class objects; rb_cString is the class given to strings created by
 * rb_str_new() and rb_str_buf_new() below.  rb_cSymbol is presumably the
 * Symbol class -- it is not used in this part of the file. */
VALUE rb_cString;
VALUE rb_cSymbol;
  45
/*
 * Per-string flag bits and predicates.
 *
 *   STR_TMPLOCK  - while set, str_modifiable() refuses modification
 *                  ("temporarily locked").
 *   STR_NOEMBED  - set when the string uses the as.heap representation;
 *                  clear means the bytes live inside the object itself.
 *   STR_SHARED   - same bit as ELTS_SHARED; as.heap.aux.shared names the
 *                  string the buffer is shared with.
 *   STR_ASSOC    - as.heap.aux.shared holds an associated object instead.
 *
 * The SHARED/ASSOC bits are only meaningful together with STR_NOEMBED, which
 * is why every predicate also tests STR_NOEMBED.  A "NOCAPA" string is a heap
 * string whose aux slot holds `shared` rather than `capa`.
 */
#define STR_TMPLOCK FL_USER7
#define STR_NOEMBED FL_USER1
#define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
#define STR_ASSOC   FL_USER3
#define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
#define STR_ASSOC_P(s)  FL_ALL(s, STR_NOEMBED|STR_ASSOC)
#define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
#define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
/* Clear the shared/assoc bits of a heap string; embedded strings are left
 * untouched. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
} while (0)
  57
  58
/* Switch str to the heap representation: set STR_NOEMBED and zero the
 * embedded-length bits kept in the flags word. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET(str, STR_NOEMBED);\
    STR_SET_EMBED_LEN(str, 0);\
} while (0)
/* Switch str to the embedded representation (only clears STR_NOEMBED; the
 * caller sets the embedded length separately). */
#define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
/* True when str keeps its bytes inside the object. */
#define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
/* Store n as the embedded length, which lives in the flag bits selected by
 * RSTRING_EMBED_LEN_MASK / RSTRING_EMBED_LEN_SHIFT.  n is evaluated once,
 * into the block-local tmp_n. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
  70
/* Set the byte length of str to n, writing to whichever place the current
 * representation keeps it (flag bits when embedded, as.heap.len otherwise). */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN(str, n);\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)

/* Decrement the byte length of str by one, for either representation.  No
 * check is made here that the length is non-zero. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN(str, n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)
  90
  91 #define RESIZE_CAPA(str,capacity) do {\
  92     if (STR_EMBED_P(str)) {\
  93         if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
  94             char *tmp = ALLOC_N(char, capacity+1);\
  95             memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
  96             RSTRING(str)->as.heap.ptr = tmp;\
  97             RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
  98             STR_SET_NOEMBED(str);\
  99             RSTRING(str)->as.heap.aux.capa = (capacity);\
 100         }\
 101     }\
 102     else {\
 103         REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
 104         if (!STR_NOCAPA_P(str))\
 105             RSTRING(str)->as.heap.aux.capa = (capacity);\
 106     }\
 107 } while (0)
 108
/* Coderange predicates; both go through rb_enc_str_coderange(), which scans
 * the string and caches the result when the coderange is still unknown. */
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* The rb_encoding* for the encoding index stored on str. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
 113
 114 static int
 115 single_byte_optimizable(VALUE str)
 116 {
 117     rb_encoding *enc = STR_ENC_GET(str);
 118
 119     if (rb_enc_mbmaxlen(enc) == 1)
 120         return 1;
 121
 122     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
 123     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
 124         return 1;
 125
 126     /* Conservative.  Possibly single byte.
 127      * "\xa1" in Shift_JIS for example. */
 128     return 0;
 129 }
 130
/* NOTE(review): not referenced in this part of the file; presumably the
 * default field separator consulted by String#split -- confirm. */
VALUE rb_fs;
 132
/*
 * Returns a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when there is no such byte.
 *
 * When SIZEOF_VALUE is 4 or 8 and the range is longer than two VALUE words,
 * the middle of the range is examined one aligned VALUE word at a time by
 * testing the top bit of every byte with NONASCII_MASK; the unaligned head
 * and the tail (from the first suspicious word onwards) are checked byte by
 * byte.
 *
 * NONASCII_MASK is deliberately left defined after this function: later
 * sections of the file are guarded by #ifdef NONASCII_MASK.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080LL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if (sizeof(VALUE) * 2 < e - p) {
        const VALUE *s, *t;
        const VALUE lowbits = sizeof(VALUE) - 1;
        /* s: p rounded up to the next VALUE-aligned address */
        s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
        /* unaligned head, byte by byte */
        while (p < (const char *)s) {
            if (!ISASCII(*p))
                return p;
            p++;
        }
        /* t: e rounded down to a VALUE-aligned address */
        t = (const VALUE*)(~lowbits & (VALUE)e);
        while (s < t) {
            if (*s & NONASCII_MASK) {
                /* this word contains a high-bit byte; let the byte loop
                 * below locate it exactly */
                t = s;
                break;
            }
            s++;
        }
        p = (const char *)t;
    }
#endif
    /* tail (or the whole range when the word scan was not used) */
    while (p < e) {
        if (!ISASCII(*p))
            return p;
        p++;
    }
    return NULL;
}
 169
 170 static int
 171 coderange_scan(const char *p, long len, rb_encoding *enc)
 172 {
 173     const char *e = p + len;
 174
 175     if (rb_enc_to_index(enc) == 0) {
 176         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 177         p = search_nonascii(p, e);
 178         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 179     }
 180
 181     if (rb_enc_asciicompat(enc)) {
 182         p = search_nonascii(p, e);
 183         if (!p) {
 184             return ENC_CODERANGE_7BIT;
 185         }
 186         while (p < e) {
 187             int ret = rb_enc_precise_mbclen(p, e, enc);
 188             if (!MBCLEN_CHARFOUND_P(ret)) {
 189                 return ENC_CODERANGE_BROKEN;
 190             }
 191             p += MBCLEN_CHARFOUND_LEN(ret);
 192             if (p < e) {
 193                 p = search_nonascii(p, e);
 194                 if (!p) {
 195                     return ENC_CODERANGE_VALID;
 196                 }
 197             }
 198         }
 199         if (e < p) {
 200             return ENC_CODERANGE_BROKEN;
 201         }
 202         return ENC_CODERANGE_VALID;
 203     }
 204
 205     while (p < e) {
 206         int ret = rb_enc_precise_mbclen(p, e, enc);
 207
 208         if (!MBCLEN_CHARFOUND_P(ret)) {
 209             return ENC_CODERANGE_BROKEN;
 210         }
 211         p += MBCLEN_CHARFOUND_LEN(ret);
 212     }
 213     if (e < p) {
 214         return ENC_CODERANGE_BROKEN;
 215     }
 216     return ENC_CODERANGE_VALID;
 217 }
 218
 219 long
 220 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
 221 {
 222     const char *p = s;
 223
 224     if (*cr == ENC_CODERANGE_BROKEN)
 225         return e - s;
 226
 227     if (rb_enc_to_index(enc) == 0) {
 228         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 229         p = search_nonascii(p, e);
 230         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
 231         return e - s;
 232     }
 233     else if (rb_enc_asciicompat(enc)) {
 234         p = search_nonascii(p, e);
 235         if (!p) {
 236             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
 237             return e - s;
 238         }
 239         while (p < e) {
 240             int ret = rb_enc_precise_mbclen(p, e, enc);
 241             if (!MBCLEN_CHARFOUND_P(ret)) {
 242                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 243                 return p - s;
 244             }
 245             p += MBCLEN_CHARFOUND_LEN(ret);
 246             if (p < e) {
 247                 p = search_nonascii(p, e);
 248                 if (!p) {
 249                     *cr = ENC_CODERANGE_VALID;
 250                     return e - s;
 251                 }
 252             }
 253         }
 254         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
 255         return p - s;
 256     }
 257     else {
 258         while (p < e) {
 259             int ret = rb_enc_precise_mbclen(p, e, enc);
 260             if (!MBCLEN_CHARFOUND_P(ret)) {
 261                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 262                 return p - s;
 263             }
 264             p += MBCLEN_CHARFOUND_LEN(ret);
 265         }
 266         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
 267         return p - s;
 268     }
 269 }
 270
 271 static inline void
 272 str_enc_copy(VALUE str1, VALUE str2)
 273 {
 274     rb_enc_set_index(str1, ENCODING_GET(str2));
 275 }
 276
 277 static void
 278 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
 279 {
 280     /* this function is designed for copying encoding and coderange
 281      * from src to new string "dest" which is made from the part of src.
 282      */
 283     str_enc_copy(dest, src);
 284     switch (ENC_CODERANGE(src)) {
 285       case ENC_CODERANGE_7BIT:
 286         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 287         break;
 288       case ENC_CODERANGE_VALID:
 289         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
 290             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
 291             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 292         else
 293             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 294         break;
 295       default:
 296         if (RSTRING_LEN(dest) == 0) {
 297             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
 298                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 299             else
 300                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 301         }
 302         break;
 303     }
 304 }
 305
 306 static void
 307 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
 308 {
 309     str_enc_copy(dest, src);
 310     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
 311 }
 312
 313 int
 314 rb_enc_str_coderange(VALUE str)
 315 {
 316     int cr = ENC_CODERANGE(str);
 317
 318     if (cr == ENC_CODERANGE_UNKNOWN) {
 319         rb_encoding *enc = STR_ENC_GET(str);
 320         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
 321         ENC_CODERANGE_SET(str, cr);
 322     }
 323     return cr;
 324 }
 325
 326 int
 327 rb_enc_str_asciionly_p(VALUE str)
 328 {
 329     rb_encoding *enc = STR_ENC_GET(str);
 330
 331     if (!rb_enc_asciicompat(enc))
 332         return Qfalse;
 333     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
 334         return Qtrue;
 335     return Qfalse;
 336 }
 337
 338 static inline void
 339 str_mod_check(VALUE s, const char *p, long len)
 340 {
 341     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
 342         rb_raise(rb_eRuntimeError, "string modified");
 343     }
 344 }
 345
 346 static inline void
 347 str_frozen_check(VALUE s)
 348 {
 349     if (OBJ_FROZEN(s)) {
 350         rb_raise(rb_eRuntimeError, "string frozen");
 351     }
 352 }
 353
 354 size_t
 355 rb_str_capacity(VALUE str)
 356 {
 357     if (STR_EMBED_P(str)) {
 358         return RSTRING_EMBED_LEN_MAX;
 359     }
 360     else if (STR_NOCAPA_P(str)) {
 361         return RSTRING(str)->as.heap.len;
 362     }
 363     else {
 364         return RSTRING(str)->as.heap.aux.capa;
 365     }
 366 }
 367
/*
 * Allocates a new string object of class klass (NEWOBJ/OBJSETUP with
 * T_STRING) and zeroes the heap fields ptr, len and aux.capa.  STR_NOEMBED
 * is not set here, so callers that want the heap representation set it
 * themselves.
 */
static inline VALUE
str_alloc(VALUE klass)
{
    NEWOBJ(str, struct RString);
    OBJSETUP(str, klass, T_STRING);

    str->as.heap.ptr = 0;
    str->as.heap.len = 0;
    str->as.heap.aux.capa = 0;

    return (VALUE)str;
}
 380
 381 static VALUE
 382 str_new(VALUE klass, const char *ptr, long len)
 383 {
 384     VALUE str;
 385
 386     if (len < 0) {
 387         rb_raise(rb_eArgError, "negative string size (or size too big)");
 388     }
 389
 390     str = str_alloc(klass);
 391     if (len > RSTRING_EMBED_LEN_MAX) {
 392         RSTRING(str)->as.heap.aux.capa = len;
 393         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
 394         STR_SET_NOEMBED(str);
 395     }
 396     if (ptr) {
 397         memcpy(RSTRING_PTR(str), ptr, len);
 398     }
 399     STR_SET_LEN(str, len);
 400     RSTRING_PTR(str)[len] = '\0';
 401     return str;
 402 }
 403
 404 VALUE
 405 rb_str_new(const char *ptr, long len)
 406 {
 407     return str_new(rb_cString, ptr, len);
 408 }
 409
 410 VALUE
 411 rb_usascii_str_new(const char *ptr, long len)
 412 {
 413     VALUE str = rb_str_new(ptr, len);
 414     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 415     return str;
 416 }
 417
 418 VALUE
 419 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
 420 {
 421     VALUE str = rb_str_new(ptr, len);
 422     rb_enc_associate(str, enc);
 423     return str;
 424 }
 425
 426 VALUE
 427 rb_str_new_cstr(const char *ptr)
 428 {
 429     if (!ptr) {
 430         rb_raise(rb_eArgError, "NULL pointer given");
 431     }
 432     return rb_str_new(ptr, strlen(ptr));
 433 }
 434
/* Keep the older name rb_str_new2 available: RUBY_ALIAS_FUNCTION (defined
 * elsewhere) presumably emits it as an alias of rb_str_new_cstr, and the
 * #define maps later uses of the old name in this file onto the new one. */
RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
#define rb_str_new2 rb_str_new_cstr
 437
 438 VALUE
 439 rb_usascii_str_new_cstr(const char *ptr)
 440 {
 441     VALUE str = rb_str_new2(ptr);
 442     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 443     return str;
 444 }
 445
/* Older name rb_usascii_str_new2, kept as an alias of
 * rb_usascii_str_new_cstr (RUBY_ALIAS_FUNCTION is defined elsewhere); the
 * #define redirects later uses in this file. */
RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
#define rb_usascii_str_new2 rb_usascii_str_new_cstr
 448
 449 VALUE
 450 rb_tainted_str_new(const char *ptr, long len)
 451 {
 452     VALUE str = rb_str_new(ptr, len);
 453
 454     OBJ_TAINT(str);
 455     return str;
 456 }
 457
 458 VALUE
 459 rb_tainted_str_new_cstr(const char *ptr)
 460 {
 461     VALUE str = rb_str_new2(ptr);
 462
 463     OBJ_TAINT(str);
 464     return str;
 465 }
 466
/* Older name rb_tainted_str_new2, kept as an alias of
 * rb_tainted_str_new_cstr (RUBY_ALIAS_FUNCTION is defined elsewhere); the
 * #define redirects later uses in this file. */
RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
#define rb_tainted_str_new2 rb_tainted_str_new_cstr
 469
 470 static VALUE
 471 str_replace_shared(VALUE str2, VALUE str)
 472 {
 473     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
 474         STR_SET_EMBED(str2);
 475         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
 476         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
 477     }
 478     else {
 479         FL_SET(str2, STR_NOEMBED);
 480         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
 481         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
 482         RSTRING(str2)->as.heap.aux.shared = str;
 483         FL_SET(str2, ELTS_SHARED);
 484     }
 485     rb_enc_cr_str_exact_copy(str2, str);
 486
 487     return str2;
 488 }
 489
 490 static VALUE
 491 str_new_shared(VALUE klass, VALUE str)
 492 {
 493     return str_replace_shared(str_alloc(klass), str);
 494 }
 495
 496 static VALUE
 497 str_new3(VALUE klass, VALUE str)
 498 {
 499     return str_new_shared(klass, str);
 500 }
 501
 502 VALUE
 503 rb_str_new_shared(VALUE str)
 504 {
 505     VALUE str2 = str_new3(rb_obj_class(str), str);
 506
 507     OBJ_INFECT(str2, str);
 508     return str2;
 509 }
 510
/* Older name rb_str_new3, kept as an alias of rb_str_new_shared
 * (RUBY_ALIAS_FUNCTION is defined elsewhere); the #define redirects later
 * uses in this file. */
RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
#define rb_str_new3 rb_str_new_shared
 513
 514 static VALUE
 515 str_new4(VALUE klass, VALUE str)
 516 {
 517     VALUE str2;
 518
 519     str2 = str_alloc(klass);
 520     STR_SET_NOEMBED(str2);
 521     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
 522     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
 523     if (STR_SHARED_P(str)) {
 524         FL_SET(str2, ELTS_SHARED);
 525         RSTRING(str2)->as.heap.aux.shared = RSTRING(str)->as.heap.aux.shared;
 526     }
 527     else {
 528         FL_SET(str, ELTS_SHARED);
 529         RSTRING(str)->as.heap.aux.shared = str2;
 530     }
 531     rb_enc_cr_str_exact_copy(str2, str);
 532     OBJ_INFECT(str2, str);
 533     return str2;
 534 }
 535
 536 VALUE
 537 rb_str_new_frozen(VALUE orig)
 538 {
 539     VALUE klass, str;
 540
 541     if (OBJ_FROZEN(orig)) return orig;
 542     klass = rb_obj_class(orig);
 543     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
 544         long ofs;
 545         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
 546         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
 547             (!OBJ_TAINTED(str) && OBJ_TAINTED(orig))) {
 548             str = str_new3(klass, str);
 549             RSTRING(str)->as.heap.ptr += ofs;
 550             RSTRING(str)->as.heap.len -= ofs;
 551         }
 552         rb_enc_cr_str_exact_copy(str, orig);
 553         OBJ_INFECT(str, orig);
 554     }
 555     else if (STR_EMBED_P(orig)) {
 556         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
 557         rb_enc_cr_str_exact_copy(str, orig);
 558         OBJ_INFECT(str, orig);
 559     }
 560     else if (STR_ASSOC_P(orig)) {
 561         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
 562         FL_UNSET(orig, STR_ASSOC);
 563         str = str_new4(klass, orig);
 564         FL_SET(str, STR_ASSOC);
 565         RSTRING(str)->as.heap.aux.shared = assoc;
 566     }
 567     else {
 568         str = str_new4(klass, orig);
 569     }
 570     OBJ_FREEZE(str);
 571     return str;
 572 }
 573
/* Older name rb_str_new4, kept as an alias of rb_str_new_frozen
 * (RUBY_ALIAS_FUNCTION is defined elsewhere); the #define redirects later
 * uses in this file. */
RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
#define rb_str_new4 rb_str_new_frozen
 576
 577 VALUE
 578 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
 579 {
 580     return str_new(rb_obj_class(obj), ptr, len);
 581 }
 582
/* Older name rb_str_new5, kept as an alias of rb_str_new_with_class
 * (RUBY_ALIAS_FUNCTION is defined elsewhere); the #define redirects later
 * uses in this file. */
RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
           rb_str_new_with_class, (obj, ptr, len))
#define rb_str_new5 rb_str_new_with_class
 586
 587 #define STR_BUF_MIN_SIZE 128
 588
 589 VALUE
 590 rb_str_buf_new(long capa)
 591 {
 592     VALUE str = str_alloc(rb_cString);
 593
 594     if (capa < STR_BUF_MIN_SIZE) {
 595         capa = STR_BUF_MIN_SIZE;
 596     }
 597     FL_SET(str, STR_NOEMBED);
 598     RSTRING(str)->as.heap.aux.capa = capa;
 599     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
 600     RSTRING(str)->as.heap.ptr[0] = '\0';
 601
 602     return str;
 603 }
 604
 605 VALUE
 606 rb_str_buf_new_cstr(const char *ptr)
 607 {
 608     VALUE str;
 609     long len = strlen(ptr);
 610
 611     str = rb_str_buf_new(len);
 612     rb_str_buf_cat(str, ptr, len);
 613
 614     return str;
 615 }
 616
/* Older name rb_str_buf_new2, kept as an alias of rb_str_buf_new_cstr
 * (RUBY_ALIAS_FUNCTION is defined elsewhere); the #define redirects later
 * uses in this file. */
RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
#define rb_str_buf_new2 rb_str_buf_new_cstr
 619
 620 VALUE
 621 rb_str_tmp_new(long len)
 622 {
 623     return str_new(0, 0, len);
 624 }
 625
 626 void
 627 rb_str_free(VALUE str)
 628 {
 629     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
 630         xfree(RSTRING(str)->as.heap.ptr);
 631     }
 632 }
 633
 634 VALUE
 635 rb_str_to_str(VALUE str)
 636 {
 637     return rb_convert_type(str, T_STRING, "String", "to_str");
 638 }
 639
/*
 * Replaces the contents of str with those of str2, taking over str2's
 * buffer instead of copying when the contents do not fit the embedded
 * representation.  In that case str2 is left empty ("abandoned") with a
 * null buffer.  Encoding, coderange and taint marks of str2 are carried
 * over to str.  Does nothing when both arguments are the same object.
 */
void
rb_str_shared_replace(VALUE str, VALUE str2)
{
    rb_encoding *enc;
    int cr;
    if (str == str2) return;
    /* remember str2's encoding and coderange before anything is changed */
    enc = STR_ENC_GET(str2);
    cr = ENC_CODERANGE(str2);
    rb_str_modify(str);
    OBJ_INFECT(str, str2);
    /* release str's own heap buffer, if it has one */
    if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
        xfree(RSTRING_PTR(str));
    }
    if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
        /* short contents: copy into the embedded area (with terminator);
         * str2 keeps its own contents in this case */
        STR_SET_EMBED(str);
        memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
        STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
        rb_enc_associate(str, enc);
        ENC_CODERANGE_SET(str, cr);
        return;
    }
    /* long contents: steal str2's buffer */
    STR_SET_NOEMBED(str);
    STR_UNSET_NOCAPA(str);
    RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
    RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
    if (STR_NOCAPA_P(str2)) {
        /* str2's aux slot holds `shared`: copy its flag bits and the slot */
        FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
        RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
    }
    else {
        RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
    }
    RSTRING(str2)->as.heap.ptr = 0;     /* abandon str2 */
    RSTRING(str2)->as.heap.len = 0;
    RSTRING(str2)->as.heap.aux.capa = 0;
    STR_UNSET_NOCAPA(str2);
    rb_enc_associate(str, enc);
    ENC_CODERANGE_SET(str, cr);
}
 679
 680 static ID id_to_s;
 681
 682 VALUE
 683 rb_obj_as_string(VALUE obj)
 684 {
 685     VALUE str;
 686
 687     if (TYPE(obj) == T_STRING) {
 688         return obj;
 689     }
 690     str = rb_funcall(obj, id_to_s, 0);
 691     if (TYPE(str) != T_STRING)
 692         return rb_any_to_s(obj);
 693     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
 694     return str;
 695 }
 696
 697 static VALUE rb_str_replace(VALUE, VALUE);
 698
 699 VALUE
 700 rb_str_dup(VALUE str)
 701 {
 702     VALUE dup = str_alloc(rb_obj_class(str));
 703     rb_str_replace(dup, str);
 704     return dup;
 705 }
 706
 707
 708 /*
 709  *  call-seq:
 710  *     String.new(str="")   => new_str
 711  *
 712  *  Returns a new string object containing a copy of <i>str</i>.
 713  */
 714
 715 static VALUE
 716 rb_str_init(int argc, VALUE *argv, VALUE str)
 717 {
 718     VALUE orig;
 719
 720     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
 721         rb_str_replace(str, orig);
 722     return str;
 723 }
 724
 725 long
 726 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
 727 {
 728     long c;
 729     const char *q;
 730
 731     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 732         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
 733     }
 734     else if (rb_enc_asciicompat(enc)) {
 735         c = 0;
 736         while (p < e) {
 737             if (ISASCII(*p)) {
 738                 q = search_nonascii(p, e);
 739                 if (!q)
 740                     return c + (e - p);
 741                 c += q - p;
 742                 p = q;
 743             }
 744             p += rb_enc_mbclen(p, e, enc);
 745             c++;
 746         }
 747         return c;
 748     }
 749
 750     for (c=0; p<e; c++) {
 751         p += rb_enc_mbclen(p, e, enc);
 752     }
 753     return c;
 754 }
 755
 756 long
 757 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
 758 {
 759     long c;
 760     const char *q;
 761     int ret;
 762
 763     *cr = 0;
 764     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 765         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
 766     }
 767     else if (rb_enc_asciicompat(enc)) {
 768         c = 0;
 769         while (p < e) {
 770             if (ISASCII(*p)) {
 771                 q = search_nonascii(p, e);
 772                 if (!q) {
 773                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
 774                     return c + (e - p);
 775                 }
 776                 c += q - p;
 777                 p = q;
 778             }
 779             ret = rb_enc_precise_mbclen(p, e, enc);
 780             if (MBCLEN_CHARFOUND_P(ret)) {
 781                 *cr |= ENC_CODERANGE_VALID;
 782                 p += MBCLEN_CHARFOUND_LEN(ret);
 783             }
 784             else {
 785                 *cr = ENC_CODERANGE_BROKEN;
 786                 p++;
 787             }
 788             c++;
 789         }
 790         if (!*cr) *cr = ENC_CODERANGE_7BIT;
 791         return c;
 792     }
 793
 794     for (c=0; p<e; c++) {
 795         ret = rb_enc_precise_mbclen(p, e, enc);
 796         if (MBCLEN_CHARFOUND_P(ret)) {
 797             *cr |= ENC_CODERANGE_VALID;
 798             p += MBCLEN_CHARFOUND_LEN(ret);
 799         }
 800         else {
 801             *cr = ENC_CODERANGE_BROKEN;
 802             p++;
 803         }
 804     }
 805     if (!*cr) *cr = ENC_CODERANGE_7BIT;
 806     return c;
 807 }
 808
#ifdef NONASCII_MASK
/* True for every byte that is not of the form 10xxxxxx, i.e. every byte
 * that starts a UTF-8 character rather than continuing one. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
/*
 * Counts, in one VALUE-sized word, the bytes that are not of the form
 * 10xxxxxx (the same test as is_utf8_lead_byte, applied to all bytes at
 * once).
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE d = *s;
    /* per byte: bit 6 becomes (bit6 | !bit7), which is 0 only for 10xxxxxx */
    d |= ~(d>>1);
    /* move that bit down to bit 0 of each byte ... */
    d >>= 6;
    /* ... and keep nothing else (mask is 0x01 in every byte) */
    d &= NONASCII_MASK >> 7;
    /* add the per-byte 0/1 values together into the lowest byte */
    d += (d>>8);
    d += (d>>16);
#if SIZEOF_VALUE == 8
    d += (d>>32);
#endif
    /* the sum is at most sizeof(VALUE), so the low nibble holds it */
    return (d&0xF);
}
#endif
 826
 827 static long
 828 str_strlen(VALUE str, rb_encoding *enc)
 829 {
 830     const char *p, *e;
 831     int n, cr;
 832
 833     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
 834     if (!enc) enc = STR_ENC_GET(str);
 835     p = RSTRING_PTR(str);
 836     e = RSTRING_END(str);
 837 #ifdef NONASCII_MASK
 838     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
 839         enc == rb_utf8_encoding()) {
 840         VALUE len = 0;
 841         if (sizeof(VALUE) * 2 < e - p) {
 842             const VALUE *s, *t;
 843             const VALUE lowbits = sizeof(VALUE) - 1;
 844             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
 845             t = (const VALUE*)(~lowbits & (VALUE)e);
 846             while (p < (const char *)s) {
 847                 if (is_utf8_lead_byte(*p)) len++;
 848                 p++;
 849             }
 850             while (s < t) {
 851                 len += count_utf8_lead_bytes_with_word(s);
 852                 s++;
 853             }
 854             p = (const char *)s;
 855         }
 856         while (p < e) {
 857             if (is_utf8_lead_byte(*p)) len++;
 858             p++;
 859         }
 860         return (long)len;
 861     }
 862 #endif
 863     n = rb_enc_strlen_cr(p, e, enc, &cr);
 864     if (cr) {
 865         ENC_CODERANGE_SET(str, cr);
 866     }
 867     return n;
 868 }
 869
 870 /*
 871  *  call-seq:
 872  *     str.length   => integer
 873  *     str.size     => integer
 874  *
 875  *  Returns the character length of <i>str</i>.
 876  */
 877
 878 VALUE
 879 rb_str_length(VALUE str)
 880 {
 881     int len;
 882
 883     len = str_strlen(str, STR_ENC_GET(str));
 884     return INT2NUM(len);
 885 }
 886
 887 /*
 888  *  call-seq:
 889  *     str.bytesize  => integer
 890  *
 891  *  Returns the length of <i>str</i> in bytes.
 892  */
 893
 894 static VALUE
 895 rb_str_bytesize(VALUE str)
 896 {
 897     return INT2NUM(RSTRING_LEN(str));
 898 }
 899
 900 /*
 901  *  call-seq:
 902  *     str.empty?   => true or false
 903  *
 904  *  Returns <code>true</code> if <i>str</i> has a length of zero.
 905  *
 906  *     "hello".empty?   #=> false
 907  *     "".empty?        #=> true
 908  */
 909
 910 static VALUE
 911 rb_str_empty(VALUE str)
 912 {
 913     if (RSTRING_LEN(str) == 0)
 914         return Qtrue;
 915     return Qfalse;
 916 }
 917
 918 /*
 919  *  call-seq:
 920  *     str + other_str   => new_str
 921  *
 922  *  Concatenation---Returns a new <code>String</code> containing
 923  *  <i>other_str</i> concatenated to <i>str</i>.
 924  *
 925  *     "Hello from " + self.to_s   #=> "Hello from main"
 926  */
 927
 928 VALUE
 929 rb_str_plus(VALUE str1, VALUE str2)
 930 {
 931     VALUE str3;
 932     rb_encoding *enc;
 933
 934     StringValue(str2);
 935     enc = rb_enc_check(str1, str2);
 936     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
 937     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
 938     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
 939            RSTRING_PTR(str2), RSTRING_LEN(str2));
 940     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
 941
 942     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
 943         OBJ_TAINT(str3);
 944     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
 945                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
 946     return str3;
 947 }
 948
 949 /*
 950  *  call-seq:
 951  *     str * integer   => new_str
 952  *
 953  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
 954  *  the receiver.
 955  *
 956  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
 957  */
 958
 959 VALUE
 960 rb_str_times(VALUE str, VALUE times)
 961 {
 962     VALUE str2;
 963     long n, len;
 964
 965     len = NUM2LONG(times);
 966     if (len < 0) {
 967         rb_raise(rb_eArgError, "negative argument");
 968     }
 969     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
 970         rb_raise(rb_eArgError, "argument too big");
 971     }
 972
 973     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
 974     if (len) {
 975         n = RSTRING_LEN(str);
 976         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), n);
 977         while (n <= len/2) {
 978             memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), n);
 979             n *= 2;
 980         }
 981         memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), len-n);
 982     }
 983     RSTRING_PTR(str2)[RSTRING_LEN(str2)] = '\0';
 984     OBJ_INFECT(str2, str);
 985     rb_enc_cr_str_copy_for_substr(str2, str);
 986
 987     return str2;
 988 }
 989
 990 /*
 991  *  call-seq:
 992  *     str % arg   => new_str
 993  *
 994  *  Format---Uses <i>str</i> as a format specification, and returns the result
 995  *  of applying it to <i>arg</i>. If the format specification contains more than
 996  *  one substitution, then <i>arg</i> must be an <code>Array</code> containing
 997  *  the values to be substituted. See <code>Kernel::sprintf</code> for details
 998  *  of the format string.
 999  *
1000  *     "%05d" % 123                              #=> "00123"
1001  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
1002  */
1003
1004 static VALUE
1005 rb_str_format_m(VALUE str, VALUE arg)
1006 {
1007     volatile VALUE tmp = rb_check_array_type(arg);
1008
1009     if (!NIL_P(tmp)) {
1010         return rb_str_format(RARRAY_LEN(tmp), RARRAY_PTR(tmp), str);
1011     }
1012     return rb_str_format(1, &arg, str);
1013 }
1014
1015 static inline void
1016 str_modifiable(VALUE str)
1017 {
1018     if (FL_TEST(str, STR_TMPLOCK)) {
1019         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1020     }
1021     if (OBJ_FROZEN(str)) rb_error_frozen("string");
1022     if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
1023         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
1024 }
1025
1026 static inline int
1027 str_independent(VALUE str)
1028 {
1029     str_modifiable(str);
1030     if (!STR_SHARED_P(str)) return 1;
1031     if (STR_EMBED_P(str)) return 1;
1032     return 0;
1033 }
1034
1035 static void
1036 str_make_independent(VALUE str)
1037 {
1038     char *ptr;
1039     long len = RSTRING_LEN(str);
1040
1041     ptr = ALLOC_N(char, len+1);
1042     if (RSTRING_PTR(str)) {
1043         memcpy(ptr, RSTRING_PTR(str), len);
1044     }
1045     STR_SET_NOEMBED(str);
1046     ptr[len] = 0;
1047     RSTRING(str)->as.heap.ptr = ptr;
1048     RSTRING(str)->as.heap.len = len;
1049     RSTRING(str)->as.heap.aux.capa = len;
1050     STR_UNSET_NOCAPA(str);
1051 }
1052
1053 void
1054 rb_str_modify(VALUE str)
1055 {
1056     if (!str_independent(str))
1057         str_make_independent(str);
1058     ENC_CODERANGE_CLEAR(str);
1059 }
1060
1061 void
1062 rb_str_associate(VALUE str, VALUE add)
1063 {
1064     /* sanity check */
1065     if (OBJ_FROZEN(str)) rb_error_frozen("string");
1066     if (STR_ASSOC_P(str)) {
1067         /* already associated */
1068         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
1069     }
1070     else {
1071         if (STR_SHARED_P(str)) {
1072             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
1073             str_make_independent(str);
1074             if (STR_ASSOC_P(assoc)) {
1075                 assoc = RSTRING(assoc)->as.heap.aux.shared;
1076                 rb_ary_concat(assoc, add);
1077                 add = assoc;
1078             }
1079         }
1080         else if (STR_EMBED_P(str)) {
1081             str_make_independent(str);
1082         }
1083         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
1084             RESIZE_CAPA(str, RSTRING_LEN(str));
1085         }
1086         FL_SET(str, STR_ASSOC);
1087         RBASIC(add)->klass = 0;
1088         RSTRING(str)->as.heap.aux.shared = add;
1089     }
1090 }
1091
1092 VALUE
1093 rb_str_associated(VALUE str)
1094 {
1095     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1096     if (STR_ASSOC_P(str)) {
1097         return RSTRING(str)->as.heap.aux.shared;
1098     }
1099     return Qfalse;
1100 }
1101
1102 VALUE
1103 rb_string_value(volatile VALUE *ptr)
1104 {
1105     VALUE s = *ptr;
1106     if (TYPE(s) != T_STRING) {
1107         s = rb_str_to_str(s);
1108         *ptr = s;
1109     }
1110     return s;
1111 }
1112
1113 char *
1114 rb_string_value_ptr(volatile VALUE *ptr)
1115 {
1116     return RSTRING_PTR(rb_string_value(ptr));
1117 }
1118
1119 char *
1120 rb_string_value_cstr(volatile VALUE *ptr)
1121 {
1122     VALUE str = rb_string_value(ptr);
1123     char *s = RSTRING_PTR(str);
1124
1125     if (!s || RSTRING_LEN(str) != strlen(s)) {
1126         rb_raise(rb_eArgError, "string contains null byte");
1127     }
1128     return s;
1129 }
1130
1131 VALUE
1132 rb_check_string_type(VALUE str)
1133 {
1134     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1135     return str;
1136 }
1137
1138 /*
1139  *  call-seq:
1140  *     String.try_convert(obj) -> string or nil
1141  *
1142  *  Try to convert <i>obj</i> into a String, using to_str method.
1143  *  Returns converted string or nil if <i>obj</i> cannot be converted
1144  *  for any reason.
1145  *
1146  *     String.try_convert("str")     # => str
1147  *     String.try_convert(/re/)      # => nil
1148  */
1149 static VALUE
1150 rb_str_s_try_convert(VALUE dummy, VALUE str)
1151 {
1152     return rb_check_string_type(str);
1153 }
1154
1155 char*
1156 rb_enc_nth(const char *p, const char *e, int nth, rb_encoding *enc)
1157 {
1158     if (rb_enc_mbmaxlen(enc) == 1) {
1159         p += nth;
1160     }
1161     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1162         p += nth * rb_enc_mbmaxlen(enc);
1163     }
1164     else if (rb_enc_asciicompat(enc)) {
1165         const char *p2, *e2;
1166         int n;
1167
1168         while (p < e && 0 < nth) {
1169             e2 = p + nth;
1170             if (e < e2)
1171                 return (char *)e;
1172             if (ISASCII(*p)) {
1173                 p2 = search_nonascii(p, e2);
1174                 if (!p2)
1175                     return (char *)e2;
1176                 nth -= p2 - p;
1177                 p = p2;
1178             }
1179             n = rb_enc_mbclen(p, e, enc);
1180             p += n;
1181             nth--;
1182         }
1183         if (nth != 0)
1184             return (char *)e;
1185         return (char *)p;
1186     }
1187     else {
1188         while (p<e && nth--) {
1189             p += rb_enc_mbclen(p, e, enc);
1190         }
1191     }
1192     if (p > e) p = e;
1193     return (char*)p;
1194 }
1195
1196 static char*
1197 str_nth(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1198 {
1199     if (singlebyte)
1200         p += nth;
1201     else {
1202         p = rb_enc_nth(p, e, nth, enc);
1203     }
1204     if (!p) return 0;
1205     if (p > e) p = e;
1206     return (char *)p;
1207 }
1208
1209 /* char offset to byte offset */
1210 static int
1211 str_offset(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1212 {
1213     const char *pp = str_nth(p, e, nth, enc, singlebyte);
1214     if (!pp) return e - p;
1215     return pp - p;
1216 }
1217
1218 #ifdef NONASCII_MASK
1219 static char *
1220 str_utf8_nth(const char *p, const char *e, int nth)
1221 {
1222     if (sizeof(VALUE) * 2 < nth) {
1223         const VALUE *s, *t;
1224         const VALUE lowbits = sizeof(VALUE) - 1;
1225         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1226         t = (const VALUE*)(~lowbits & (VALUE)e);
1227         while (p < (const char *)s) {
1228             if (is_utf8_lead_byte(*p)) nth--;
1229             p++;
1230         }
1231         do {
1232             nth -= count_utf8_lead_bytes_with_word(s);
1233             s++;
1234         } while (s < t && sizeof(VALUE) <= nth);
1235         p = (char *)s;
1236     }
1237     while (p < e) {
1238         if (is_utf8_lead_byte(*p)) {
1239             if (nth == 0) break;
1240             nth--;
1241         }
1242         p++;
1243     }
1244     return (char *)p;
1245 }
1246
/*
 * UTF-8 variant of the character-offset to byte-offset conversion: the byte
 * distance from +p+ to the position str_utf8_nth() reports, or the whole
 * range length when that position is null.
 */
static int
str_utf8_offset(const char *p, const char *e, int nth)
{
    const char *q = str_utf8_nth(p, e, nth);

    return (int)((q ? q : e) - p);
}
1254 #endif
1255
1256 /* byte offset to char offset */
1257 long
1258 rb_str_sublen(VALUE str, long pos)
1259 {
1260     if (single_byte_optimizable(str) || pos < 0)
1261         return pos;
1262     else {
1263         char *p = RSTRING_PTR(str);
1264         return rb_enc_strlen(p, p + pos, STR_ENC_GET(str));
1265     }
1266 }
1267
1268 VALUE
1269 rb_str_subseq(VALUE str, long beg, long len)
1270 {
1271     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1272
1273     rb_enc_cr_str_copy_for_substr(str2, str);
1274     OBJ_INFECT(str2, str);
1275
1276     return str2;
1277 }
1278
/*
 * Return the part of +str+ that starts at character index +beg+ and spans
 * up to +len+ characters, as a new string.  A negative +beg+ counts from the
 * end of the string.  Returns Qnil when +len+ is negative or when +beg+
 * lies outside the string.
 */
1279 VALUE
1280 rb_str_substr(VALUE str, long beg, long len)
1281 {
1282     rb_encoding *enc = STR_ENC_GET(str);
1283     VALUE str2;
1284     char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
1285     int singlebyte;
1286
1287     if (len < 0) return Qnil;
1288     if (!RSTRING_LEN(str)) {
1289         len = 0;
1290     }
1291     if (beg < 0) {
             /* the substring cannot extend past the end of the string */
1292         if (len > -beg) len = -beg;
             /* start is close to the end of a long string: walk backwards
              * from the end instead of measuring the whole string */
1293         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
1294             beg = -beg;
                 /* move e back (beg - len) characters: e becomes the end of the result */
1295             while (beg-- > len && (e = rb_enc_prev_char(s, e, enc)) != 0);
1296             p = e;
1297             if (!p) return Qnil;
                 /* move p back len characters: p becomes the start of the result */
1298             while (len-- > 0 && (p = rb_enc_prev_char(s, p, enc)) != 0);
1299             if (!p) return Qnil;
                 /* from here on len is a byte count */
1300             len = e - p;
1301             goto sub;
1302         }
1303         else {
                 /* turn the negative index into an index from the front */
1304             beg += str_strlen(str, enc);
1305             if (beg < 0) return Qnil;
1306         }
1307     }
1308     else if (beg > 0 && beg > str_strlen(str, enc)) {
1309         return Qnil;
1310     }
1311     singlebyte = single_byte_optimizable(str);
         /* The chain below sets p to the first byte of the result and
          * converts len from a character count into a byte count. */
1312     if (len == 0) {
1313         p = 0;
1314     }
1315 #ifdef NONASCII_MASK
1316     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1317         enc == rb_utf8_encoding()) {
             /* valid UTF-8: use the lead-byte counting helpers */
1318         p = str_utf8_nth(s, e, beg);
1319         len = str_utf8_offset(p, e, len);
1320     }
1321 #endif
1322     else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) {
             /* start is at the very end: empty result */
1323         len = 0;
1324     }
1325     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
             /* fixed-width encoding: scale by the character width, clamped
              * to the bytes that remain */
1326         if (len * rb_enc_mbmaxlen(enc) > e - p)
1327             len = e - p;
1328         else
1329             len *= rb_enc_mbmaxlen(enc);
1330     }
1331     else {
1332         len = str_offset(p, e, len, enc, singlebyte);
1333     }
1334   sub:
         /* A result too long to embed that ends where str ends is built from
          * rb_str_new4()/str_new3() and then narrowed to the last len bytes
          * (presumably so it can share str's buffer -- confirm against those
          * helpers).
          * NOTE(review): beg is a character index here (and has been
          * decremented on the backward-walk path) while len and
          * RSTRING_LEN are byte counts; confirm this comparison is intended. */
1335     if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
1336         str2 = rb_str_new4(str);
1337         str2 = str_new3(rb_obj_class(str2), str2);
1338         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1339         RSTRING(str2)->as.heap.len = len;
1340     }
1341     else {
1342         str2 = rb_str_new5(str, p, len);
1343         rb_enc_cr_str_copy_for_substr(str2, str);
1344         OBJ_INFECT(str2, str);
1345     }
1346
1347     return str2;
1348 }
1349
1350 VALUE
1351 rb_str_freeze(VALUE str)
1352 {
1353     if (STR_ASSOC_P(str)) {
1354         VALUE ary = RSTRING(str)->as.heap.aux.shared;
1355         OBJ_FREEZE(ary);
1356     }
1357     return rb_obj_freeze(str);
1358 }
1359
/*
 * rb_str_dup_frozen is kept as another name for rb_str_new_frozen.
 * RUBY_ALIAS_FUNCTION presumably emits the exported symbol (its definition
 * is not in this file -- confirm); the #define makes every later use of the
 * name in this file refer to rb_str_new_frozen directly.
 */
1360 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
1361 #define rb_str_dup_frozen rb_str_new_frozen
1362
1363 VALUE
1364 rb_str_locktmp(VALUE str)
1365 {
1366     if (FL_TEST(str, STR_TMPLOCK)) {
1367         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1368     }
1369     FL_SET(str, STR_TMPLOCK);
1370     return str;
1371 }
1372
1373 VALUE
1374 rb_str_unlocktmp(VALUE str)
1375 {
1376     if (!FL_TEST(str, STR_TMPLOCK)) {
1377         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1378     }
1379     FL_UNSET(str, STR_TMPLOCK);
1380     return str;
1381 }
1382
1383 void
1384 rb_str_set_len(VALUE str, long len)
1385 {
1386     STR_SET_LEN(str, len);
1387     RSTRING_PTR(str)[len] = '\0';
1388 }
1389
/*
 * Resize +str+ to +len+ bytes and return it.  Raises ArgumentError when
 * +len+ is negative.  The string is first made modifiable with
 * rb_str_modify(); when the length actually changes, a NUL byte is stored at
 * index len.  The string switches between the embedded and the heap
 * representation as required by RSTRING_EMBED_LEN_MAX.
 */
1390 VALUE
1391 rb_str_resize(VALUE str, long len)
1392 {
1393     long slen;
1394
1395     if (len < 0) {
1396         rb_raise(rb_eArgError, "negative string size (or size too big)");
1397     }
1398
1399     rb_str_modify(str);
1400     slen = RSTRING_LEN(str);
1401     if (len != slen) {
1402         if (STR_EMBED_P(str)) {
1403             char *ptr;
                 /* embedded and still fits: only the embedded length changes */
1404             if (len <= RSTRING_EMBED_LEN_MAX) {
1405                 STR_SET_EMBED_LEN(str, len);
1406                 RSTRING(str)->as.ary[len] = '\0';
1407                 return str;
1408             }
                 /* embedded -> heap: copy the old bytes out before the heap
                  * fields are written */
1409             ptr = ALLOC_N(char,len+1);
1410             MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
1411             RSTRING(str)->as.heap.ptr = ptr;
1412             STR_SET_NOEMBED(str);
1413         }
1414         else if (len <= RSTRING_EMBED_LEN_MAX) {
                 /* heap -> embedded: the heap pointer is saved first because
                  * the copy below writes into as.ary (presumably the same
                  * storage as the heap fields -- confirm against struct RString) */
1415             char *ptr = RSTRING(str)->as.heap.ptr;
1416             STR_SET_EMBED(str);
1417             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, len);
1418             RSTRING(str)->as.ary[len] = '\0';
1419             STR_SET_EMBED_LEN(str, len);
1420             xfree(ptr);
1421             return str;
1422         }
             /* heap -> heap: reallocate only when growing, or when shrinking
              * by more than 1024 bytes */
1423         else if (slen < len || slen - len > 1024) {
1424             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1425         }
             /* the aux slot holds capa only when it is not used for a shared
              * or associated object */
1426         if (!STR_NOCAPA_P(str)) {
1427             RSTRING(str)->as.heap.aux.capa = len;
1428         }
1429         RSTRING(str)->as.heap.len = len;
1430         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
1431     }
1432     return str;
1433 }
1434
1435 static VALUE
1436 str_buf_cat(VALUE str, const char *ptr, long len)
1437 {
1438     long capa, total, off = -1;
1439
1440     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1441         off = ptr - RSTRING_PTR(str);
1442     }
1443     rb_str_modify(str);
1444     if (len == 0) return 0;
1445     if (STR_ASSOC_P(str)) {
1446         FL_UNSET(str, STR_ASSOC);
1447         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1448     }
1449     else if (STR_EMBED_P(str)) {
1450         capa = RSTRING_EMBED_LEN_MAX;
1451     }
1452     else {
1453         capa = RSTRING(str)->as.heap.aux.capa;
1454     }
1455     if (RSTRING_LEN(str) >= LONG_MAX - len) {
1456         rb_raise(rb_eArgError, "string sizes too big");
1457     }
1458     total = RSTRING_LEN(str)+len;
1459     if (capa <= total) {
1460         while (total > capa) {
1461             if (capa + 1 >= LONG_MAX / 2) {
1462                 capa = (total + 4095) / 4096;
1463                 break;
1464             }
1465             capa = (capa + 1) * 2;
1466         }
1467         RESIZE_CAPA(str, capa);
1468     }
1469     if (off != -1) {
1470         ptr = RSTRING_PTR(str) + off;
1471     }
1472     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1473     STR_SET_LEN(str, total);
1474     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1475
1476     return str;
1477 }
1478
1479 VALUE
1480 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1481 {
1482     if (len == 0) return str;
1483     if (len < 0) {
1484         rb_raise(rb_eArgError, "negative string size (or size too big)");
1485     }
1486     return str_buf_cat(str, ptr, len);
1487 }
1488
1489 VALUE
1490 rb_str_buf_cat2(VALUE str, const char *ptr)
1491 {
1492     return rb_str_buf_cat(str, ptr, strlen(ptr));
1493 }
1494
1495 VALUE
1496 rb_str_cat(VALUE str, const char *ptr, long len)
1497 {
1498     if (len < 0) {
1499         rb_raise(rb_eArgError, "negative string size (or size too big)");
1500     }
1501     if (STR_ASSOC_P(str)) {
1502         rb_str_modify(str);
1503         if (STR_EMBED_P(str)) str_make_independent(str);
1504         REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
1505         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
1506         RSTRING(str)->as.heap.len += len;
1507         RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0'; /* sentinel */
1508         return str;
1509     }
1510
1511     return rb_str_buf_cat(str, ptr, len);
1512 }
1513
1514 VALUE
1515 rb_str_cat2(VALUE str, const char *ptr)
1516 {
1517     return rb_str_cat(str, ptr, strlen(ptr));
1518 }
1519
/*
 * Append +len+ bytes at +ptr+ to +str+, taking encodings and code ranges
 * into account, and return +str+.
 *
 *   ptr_encindex  encoding index of the appended bytes
 *   ptr_cr        code range of the appended bytes (may be UNKNOWN)
 *   ptr_cr_ret    when non-null, receives the code range of the appended
 *                 bytes as determined here
 *
 * The encoding index and code range of the result are computed from both
 * operands and stored on +str+ after the bytes are appended.  Raises
 * EncCompatError when the two encodings cannot be combined, and
 * ArgumentError for a negative +len+.
 */
1520 static VALUE
1521 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
1522     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
1523 {
1524     int str_encindex = ENCODING_GET(str);
1525     int res_encindex;
1526     int str_cr, res_cr;
1527     int str_a8 = ENCODING_IS_ASCII8BIT(str);
         /* index 0 is treated as the ASCII-8BIT encoding here */
1528     int ptr_a8 = ptr_encindex == 0;
1529
1530     str_cr = ENC_CODERANGE(str);
1531
1532     if (str_encindex == ptr_encindex) {
             /* Same encoding.  Scanning the new bytes is skipped when the
              * result's code range would be UNKNOWN anyway. */
1533         if (str_cr == ENC_CODERANGE_UNKNOWN ||
1534             (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
1535             ptr_cr = ENC_CODERANGE_UNKNOWN;
1536         }
1537         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1538             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
1539         }
1540     }
1541     else {
1542         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
1543         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
             /* Different encodings, at least one not ASCII-compatible: only
              * allowed when one side is empty. */
1544         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
1545             if (len == 0)
1546                 return str;
1547             if (RSTRING_LEN(str) == 0) {
                     /* empty receiver simply takes over the encoding of ptr */
1548                 rb_str_buf_cat(str, ptr, len);
1549                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
1550                 return str;
1551             }
1552             goto incompatible;
1553         }
1554         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1555             ptr_cr = coderange_scan(ptr, len, ptr_enc);
1556         }
1557         if (str_cr == ENC_CODERANGE_UNKNOWN) {
                 /* the receiver's code range is only computed when the
                  * compatibility decision below depends on it */
1558             if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
1559                 str_cr = rb_enc_str_coderange(str);
1560             }
1561         }
1562     }
1563     if (ptr_cr_ret)
1564         *ptr_cr_ret = ptr_cr;
1565
         /* Two different encodings can be mixed only if at least one side is
          * pure 7bit. */
1566     if (str_encindex != ptr_encindex &&
1567         str_cr != ENC_CODERANGE_7BIT &&
1568         ptr_cr != ENC_CODERANGE_7BIT) {
1569       incompatible:
1570         rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
1571             rb_enc_name(rb_enc_from_index(str_encindex)),
1572             rb_enc_name(rb_enc_from_index(ptr_encindex)));
1573     }
1574
         /* Choose the encoding index and code range of the result. */
1575     if (str_cr == ENC_CODERANGE_UNKNOWN) {
1576         res_encindex = str_encindex;
1577         res_cr = ENC_CODERANGE_UNKNOWN;
1578     }
1579     else if (str_cr == ENC_CODERANGE_7BIT) {
1580         if (ptr_cr == ENC_CODERANGE_7BIT) {
                 /* both 7bit: keep the receiver's encoding unless it is ASCII-8BIT */
1581             res_encindex = !str_a8 ? str_encindex : ptr_encindex;
1582             res_cr = ENC_CODERANGE_7BIT;
1583         }
1584         else {
1585             res_encindex = ptr_encindex;
1586             res_cr = ptr_cr;
1587         }
1588     }
1589     else if (str_cr == ENC_CODERANGE_VALID) {
1590         res_encindex = str_encindex;
1591         res_cr = str_cr;
1592     }
1593     else { /* str_cr == ENC_CODERANGE_BROKEN */
1594         res_encindex = str_encindex;
1595         res_cr = str_cr;
             /* appended bytes might complete a broken sequence: rescan later */
1596         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
1597     }
1598
         /* NOTE(review): this check runs after coderange_scan() may already
          * have been called with the negative len -- confirm that is safe. */
1599     if (len < 0) {
1600         rb_raise(rb_eArgError, "negative string size (or size too big)");
1601     }
1602     str_buf_cat(str, ptr, len);
1603     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
1604     return str;
1605 }
1606
1607 VALUE
1608 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
1609 {
1610     return rb_enc_cr_str_buf_cat(str, ptr, len,
1611         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
1612 }
1613
1614 VALUE
1615 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
1616 {
1617     /* ptr must reference NUL terminated ASCII string. */
1618     int encindex = ENCODING_GET(str);
1619     rb_encoding *enc = rb_enc_from_index(encindex);
1620     if (rb_enc_asciicompat(enc)) {
1621         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
1622             encindex, ENC_CODERANGE_7BIT, 0);
1623     }
1624     else {
1625         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
1626         while (*ptr) {
1627             int c = (unsigned char)*ptr;
1628             int len = rb_enc_codelen(c, enc);
1629             rb_enc_mbcput(c, buf, enc);
1630             rb_enc_cr_str_buf_cat(str, buf, len,
1631                 encindex, ENC_CODERANGE_VALID, 0);
1632             ptr++;
1633         }
1634         return str;
1635     }
1636 }
1637
1638 VALUE
1639 rb_str_buf_append(VALUE str, VALUE str2)
1640 {
1641     int str2_cr;
1642
1643     str2_cr = ENC_CODERANGE(str2);
1644
1645     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
1646         ENCODING_GET(str2), str2_cr, &str2_cr);
1647
1648     OBJ_INFECT(str, str2);
1649     ENC_CODERANGE_SET(str2, str2_cr);
1650
1651     return str;
1652 }
1653
1654 VALUE
1655 rb_str_append(VALUE str, VALUE str2)
1656 {
1657     rb_encoding *enc;
1658     int cr, cr2;
1659
1660     StringValue(str2);
1661     if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
1662         long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
1663         enc = rb_enc_check(str, str2);
1664         cr = ENC_CODERANGE(str);
1665         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
1666         rb_str_modify(str);
1667         REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1668         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
1669                RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
1670         RSTRING(str)->as.heap.len = len;
1671         rb_enc_associate(str, enc);
1672         ENC_CODERANGE_SET(str, cr);
1673         OBJ_INFECT(str, str2);
1674         return str;
1675     }
1676     return rb_str_buf_append(str, str2);
1677 }
1678
1679
1680 /*
1681  *  call-seq:
1682  *     str << fixnum        => str
1683  *     str.concat(fixnum)   => str
1684  *     str << obj           => str
1685  *     str.concat(obj)      => str
1686  *
1687  *  Append---Concatenates the given object to <i>str</i>. If the object is a
1688  *  <code>Fixnum</code>, it is considered as a codepoint, and is converted
1689  *  to a character before concatenation.
1690  *
1691  *     a = "hello "
1692  *     a << "world"   #=> "hello world"
1693  *     a.concat(33)   #=> "hello world!"
1694  */
1695
1696 VALUE
1697 rb_str_concat(VALUE str1, VALUE str2)
1698 {
1699     if (FIXNUM_P(str2)) {
1700         rb_encoding *enc = STR_ENC_GET(str1);
1701         int c = FIX2INT(str2);
1702         int pos = RSTRING_LEN(str1);
1703         int len = rb_enc_codelen(c, enc);
1704         int cr = ENC_CODERANGE(str1);
1705
1706         rb_str_resize(str1, pos+len);
1707         rb_enc_mbcput(c, RSTRING_PTR(str1)+pos, enc);
1708         ENC_CODERANGE_SET(str1, cr);
1709         return str1;
1710     }
1711     return rb_str_append(str1, str2);
1712 }
1713
/*
 * UNALIGNED_WORD_ACCESS is 1 on 32-bit x86 builds (gcc's __i386__ or MSVC's
 * _M_IX86) and 0 everywhere else unless it was already defined.  hash()
 * below compiles its byte-wise alignment handling only when this is 0.
 */
1714 #if defined __i386__ || defined _M_IX86
1715 #define UNALIGNED_WORD_ACCESS 1
1716 #endif
1717 #ifndef UNALIGNED_WORD_ACCESS
1718 #define UNALIGNED_WORD_ACCESS 0
1719 #endif
1720
1721 /* MurmurHash described in http://murmurhash.googlepages.com/ */
/*
 * Hash +len+ bytes at +data+, starting from the seed +h+, and return the
 * 32-bit result.
 *
 * The input is consumed four bytes at a time (add the word, multiply by m,
 * xor-shift by r); the 0-3 trailing bytes are folded in by the final switch
 * and the value is mixed a few more times before it is returned.  Byte
 * order is selected at compile time with WORDS_BIGENDIAN.
 */
1722 static unsigned int
1723 hash(const unsigned char * data, int len, unsigned int h)
1724 {
1725     const unsigned int m = 0x7fd652ad;
1726     const int r = 16;
1727
1728     h += 0xdeadbeef;
1729
1730     if (len >= 4) {
1731 #if !UNALIGNED_WORD_ACCESS
         /* data is not 4-byte aligned: avoid unaligned 32-bit loads */
1732         int align = (VALUE)data & 3;
1733         if (align) {
                 /* t carries bytes already read but not yet mixed in,
                  * d is the most recent aligned word */
1734             uint32_t t = 0, d = 0;
1735             int sl, sr, pack;
1736
                 /* read the 4-align bytes in front of the next aligned
                  * address one by one (the cases fall through on purpose) */
1737             switch (align) {
1738 #ifdef WORDS_BIGENDIAN
1739               case 1: t |= data[2];
1740               case 2: t |= data[1] << 8;
1741               case 3: t |= data[0] << 16;
1742 #else
1743               case 1: t |= data[2] << 16;
1744               case 2: t |= data[1] << 8;
1745               case 3: t |= data[0];
1746 #endif
1747             }
1748
1749 #ifdef WORDS_BIGENDIAN
1750             t >>= (8 * align) - 8;
1751 #else
1752             t <<= (8 * align);
1753 #endif
1754
1755             data += 4-align;
1756             len -= 4-align;
1757
1758             sl = 8 * (4-align);
1759             sr = 8 * align;
1760
                 /* data is aligned now: each mixed word is stitched together
                  * from the carried bytes in t and the new load d */
1761             while (len >= 4) {
1762                 d = *(uint32_t *)data;
1763 #ifdef WORDS_BIGENDIAN
1764                 t = (t << sr) | (d >> sl);
1765 #else
1766                 t = (t >> sr) | (d << sl);
1767 #endif
1768                 h += t;
1769                 h *= m;
1770                 h ^= h >> r;
1771                 t = d;
1772
1773                 data += 4;
1774                 len -= 4;
1775             }
1776
                 /* combine the carried bytes with up to align of the
                  * remaining bytes into one last word */
1777             pack = len < align ? len : align;
1778             d = 0;
1779             switch (pack) {
1780 #ifdef WORDS_BIGENDIAN
1781               case 3: d |= data[2] << 8;
1782               case 2: d |= data[1] << 16;
1783               case 1: d |= data[0] << 24;
1784               case 0:
1785                 h += (t << sr) | (d >> sl);
1786 #else
1787               case 3: d |= data[2] << 16;
1788               case 2: d |= data[1] << 8;
1789               case 1: d |= data[0];
1790               case 0:
1791                 h += (t >> sr) | (d << sl);
1792 #endif
1793                 h *= m;
1794                 h ^= h >> r;
1795             }
1796
1797             data += pack;
1798             len -= pack;
1799         }
1800         else
1801 #endif
1802         {
                 /* aligned data (or unaligned access permitted): plain
                  * word-at-a-time loop */
1803             do {
1804                 h += *(uint32_t *)data;
1805                 h *= m;
1806                 h ^= h >> r;
1807
1808                 data += 4;
1809                 len -= 4;
1810             } while (len >= 4);
1811         }
1812     }
1813
         /* fold in the 0-3 bytes that are left (cases fall through) */
1814     switch(len) {
1815 #ifdef WORDS_BIGENDIAN
1816       case 3:
1817         h += data[2] << 8;
1818       case 2:
1819         h += data[1] << 16;
1820       case 1:
1821         h += data[0] << 24;
1822 #else
1823       case 3:
1824         h += data[2] << 16;
1825       case 2:
1826         h += data[1] << 8;
1827       case 1:
1828         h += data[0];
1829 #endif
1830         h *= m;
1831         h ^= h >> r;
1832     }
1833
         /* final mixing */
1834     h *= m;
1835     h ^= h >> 10;
1836     h *= m;
1837     h ^= h >> 17;
1838
1839     return h;
1840 }
1841
/*
 * Hash +len+ bytes at +ptr+ with hash().  The seed is obtained once from
 * rb_genrand_int32() on the first call and reused for all later calls.
 */
int
rb_memhash(const void *ptr, long len)
{
    static int seeded = 0;
    static unsigned int seed;

    if (seeded == 0) {
        seed = rb_genrand_int32();
        seeded = 1;
    }
    return hash(ptr, len, seed);
}
1855
1856 int
1857 rb_str_hash(VALUE str)
1858 {
1859     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
1860 }
1861
1862 int
1863 rb_str_hash_cmp(VALUE str1, VALUE str2)
1864 {
1865     int len;
1866
1867     if (!rb_str_comparable(str1, str2)) return 1;
1868     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1869         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1870         return 0;
1871     }
1872     return 1;
1873 }
1874
1875 /*
1876  * call-seq:
1877  *    str.hash   => fixnum
1878  *
1879  * Return a hash based on the string's length and content.
1880  */
1881
1882 static VALUE
1883 rb_str_hash_m(VALUE str)
1884 {
1885     int hval = rb_str_hash(str);
1886     return INT2FIX(hval);
1887 }
1888
/* Smaller of a and b.  Each argument may be evaluated twice, so pass only
 * expressions without side effects. */
1889 #define lesser(a,b) (((a)>(b))?(b):(a))
1890
1891 int
1892 rb_str_comparable(VALUE str1, VALUE str2)
1893 {
1894     int idx1, idx2;
1895     int rc1, rc2;
1896
1897     if (RSTRING_LEN(str1) == 0) return Qtrue;
1898     if (RSTRING_LEN(str2) == 0) return Qtrue;
1899     idx1 = ENCODING_GET(str1);
1900     idx2 = ENCODING_GET(str2);
1901     if (idx1 == idx2) return Qtrue;
1902     rc1 = rb_enc_str_coderange(str1);
1903     rc2 = rb_enc_str_coderange(str2);
1904     if (rc1 == ENC_CODERANGE_7BIT) {
1905         if (rc2 == ENC_CODERANGE_7BIT) return Qtrue;
1906         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
1907             return Qtrue;
1908     }
1909     if (rc2 == ENC_CODERANGE_7BIT) {
1910         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
1911             return Qtrue;
1912     }
1913     return Qfalse;
1914 }
1915
1916 int
1917 rb_str_cmp(VALUE str1, VALUE str2)
1918 {
1919     long len;
1920     int retval;
1921
1922     len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
1923     retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
1924     if (retval == 0) {
1925         if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
1926             if (!rb_enc_compatible(str1, str2)) {
1927                 if (ENCODING_GET(str1) - ENCODING_GET(str2) > 0)
1928                     return 1;
1929                 return -1;
1930             }
1931             return 0;
1932         }
1933         if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
1934         return -1;
1935     }
1936     if (retval > 0) return 1;
1937     return -1;
1938 }
1939
1940
1941 /*
1942  *  call-seq:
1943  *     str == obj   => true or false
1944  *
1945  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
1946  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
1947  *  <code><=></code> <i>obj</i> returns zero.
1948  */
1949
1950 VALUE
1951 rb_str_equal(VALUE str1, VALUE str2)
1952 {
1953     int len;
1954
1955     if (str1 == str2) return Qtrue;
1956     if (TYPE(str2) != T_STRING) {
1957         if (!rb_respond_to(str2, rb_intern("to_str"))) {
1958             return Qfalse;
1959         }
1960         return rb_equal(str2, str1);
1961     }
1962     if (!rb_str_comparable(str1, str2)) return Qfalse;
1963     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1964         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1965         return Qtrue;
1966     }
1967     return Qfalse;
1968 }
1969
1970 /*
1971  * call-seq:
1972  *   str.eql?(other)   => true or false
1973  *
1974  * Two strings are equal if they have the same length and content.
1975  */
1976
1977 static VALUE
1978 rb_str_eql(VALUE str1, VALUE str2)
1979 {
1980     if (TYPE(str2) != T_STRING || RSTRING_LEN(str1) != RSTRING_LEN(str2))
1981         return Qfalse;
1982
1983     if (!rb_str_comparable(str1, str2)) return Qfalse;
1984     if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2),
1985                lesser(RSTRING_LEN(str1), RSTRING_LEN(str2))) == 0)
1986         return Qtrue;
1987
1988     return Qfalse;
1989 }
1990
1991 /*
1992  *  call-seq:
1993  *     str <=> other_str   => -1, 0, +1
1994  *
1995  *  Comparison---Returns -1 if <i>str</i> is less than, 0 if
1996  *  <i>str</i> is equal to, and +1 if <i>str</i> is greater than
1997  *  <i>other_str</i>. If the strings are of different lengths, and the strings are
1998  *  equal when compared up to the shortest length, then the longer string is
1999  *  considered greater than the shorter one. In older versions of Ruby, setting
2000  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
2001  *  in favor of using <code>String#casecmp</code>.
2002  *
2003  *  <code><=></code> is the basis for the methods <code><</code>,
2004  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
2005  *  included from module <code>Comparable</code>.  The method
2006  *  <code>String#==</code> does not use <code>Comparable#==</code>.
2007  *
2008  *     "abcdef" <=> "abcde"     #=> 1
2009  *     "abcdef" <=> "abcdef"    #=> 0
2010  *     "abcdef" <=> "abcdefg"   #=> -1
2011  *     "abcdef" <=> "ABCDEF"    #=> 1
2012  */
2013
2014 static VALUE
2015 rb_str_cmp_m(VALUE str1, VALUE str2)
2016 {
2017     long result;
2018
2019     if (TYPE(str2) != T_STRING) {
2020         if (!rb_respond_to(str2, rb_intern("to_str"))) {
2021             return Qnil;
2022         }
2023         else if (!rb_respond_to(str2, rb_intern("<=>"))) {
2024             return Qnil;
2025         }
2026         else {
2027             VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
2028
2029             if (NIL_P(tmp)) return Qnil;
2030             if (!FIXNUM_P(tmp)) {
2031                 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
2032             }
2033             result = -FIX2LONG(tmp);
2034         }
2035     }
2036     else {
2037         result = rb_str_cmp(str1, str2);
2038     }
2039     return LONG2NUM(result);
2040 }
2041
2042 /*
2043  *  call-seq:
2044  *     str.casecmp(other_str)   => -1, 0, +1
2045  *
2046  *  Case-insensitive version of <code>String#<=></code>.
2047  *
2048  *     "abcdef".casecmp("abcde")     #=> 1
2049  *     "aBcDeF".casecmp("abcdef")    #=> 0
2050  *     "abcdef".casecmp("abcdefg")   #=> -1
2051  *     "abcdef".casecmp("ABCDEF")    #=> 0
2052  */
2053
2054 static VALUE
2055 rb_str_casecmp(VALUE str1, VALUE str2)
2056 {
2057     long len;
2058     rb_encoding *enc;
2059     char *p1, *p1end, *p2, *p2end;
2060
2061     StringValue(str2);
2062     enc = rb_enc_compatible(str1, str2);
2063     if (!enc) {
2064         return Qnil;
2065     }
2066
2067     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2068     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2069     while (p1 < p1end && p2 < p2end) {
2070         int c1 = rb_enc_codepoint(p1, p1end, enc);
2071         int c2 = rb_enc_codepoint(p2, p2end, enc);
2072
2073         if (c1 != c2) {
2074             c1 = rb_enc_toupper(c1, enc);
2075             c2 = rb_enc_toupper(c2, enc);
2076             if (c1 > c2) return INT2FIX(1);
2077             if (c1 < c2) return INT2FIX(-1);
2078         }
2079         len = rb_enc_codelen(c1, enc);
2080         p1 += len;
2081         p2 += len;
2082     }
2083     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2084     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2085     return INT2FIX(-1);
2086 }
2087
/*
 * Search +str+ for +sub+, starting at character index +offset+ (a negative
 * offset counts from the end of +str+).  Returns the byte offset of the
 * match within +str+, or -1 when there is no match, when the offset is out
 * of range, or when +sub+ is a broken string.  rb_enc_check() is applied to
 * the two strings first.
 */
2088 static long
2089 rb_str_index(VALUE str, VALUE sub, long offset)
2090 {
2091     long pos;
2092     char *s, *sptr;
2093     long len, slen;
2094     rb_encoding *enc;
2095
2096     enc = rb_enc_check(str, sub);
2097     if (is_broken_string(sub)) {
2098         return -1;
2099     }
         /* len and slen are character counts at this point */
2100     len = str_strlen(str, enc);
2101     slen = str_strlen(sub, enc);
2102     if (offset < 0) {
2103         offset += len;
2104         if (offset < 0) return -1;
2105     }
         /* not enough characters left for sub to fit */
2106     if (len - offset < slen) return -1;
2107     s = RSTRING_PTR(str);
2108     if (offset) {
             /* from here on offset is a byte offset */
2109         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2110         s += offset;
2111     }
2112     if (slen == 0) return offset;
2113     /* need proceed one character at a time */
2114     sptr = RSTRING_PTR(sub);
         /* slen and len are byte counts from here on */
2115     slen = RSTRING_LEN(sub);
2116     len = RSTRING_LEN(str) - offset;
2117     for (;;) {
2118         char *t;
2119         pos = rb_memsearch(sptr, slen, s, len, enc);
2120         if (pos < 0) return pos;
             /* Accept the hit only when rb_enc_right_char_head() returns
              * s+pos itself (presumably: the hit starts on a character
              * boundary).  Otherwise resume the search at t. */
2121         t = rb_enc_right_char_head(s, s+pos, enc);
2122         if (t == s + pos) break;
2123         if ((len -= t - s) <= 0) return -1;
2124         offset += t - s;
2125         s = t;
2126     }
2127     return pos + offset;
2128 }
2129
2130
2131 /*
2132  *  call-seq:
2133  *     str.index(substring [, offset])   => fixnum or nil
2134  *     str.index(regexp [, offset])      => fixnum or nil
2135  *
2136  *  Returns the index of the first occurrence of the given <i>substring</i> or
2137  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2138  *  found. If the second parameter is present, it specifies the position in the
2139  *  string to begin the search.
2140  *
2141  *     "hello".index('e')             #=> 1
2142  *     "hello".index('lo')            #=> 3
2143  *     "hello".index('a')             #=> nil
2144  *     "hello".index(?e)              #=> 1
2145  *     "hello".index(/[aeiou]/, -3)   #=> 4
2146  */
2147
2148 static VALUE
2149 rb_str_index_m(int argc, VALUE *argv, VALUE str)
2150 {
2151     VALUE sub;
2152     VALUE initpos;
2153     long pos;
2154
2155     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2156         pos = NUM2LONG(initpos);
2157     }
2158     else {
2159         pos = 0;
2160     }
2161     if (pos < 0) {
2162         pos += str_strlen(str, STR_ENC_GET(str));
2163         if (pos < 0) {
2164             if (TYPE(sub) == T_REGEXP) {
2165                 rb_backref_set(Qnil);
2166             }
2167             return Qnil;
2168         }
2169     }
2170
2171     switch (TYPE(sub)) {
2172       case T_REGEXP:
2173         pos = rb_reg_adjust_startpos(sub, str, pos, 0);
2174         pos = rb_reg_search(sub, str, pos, 0);
2175         pos = rb_str_sublen(str, pos);
2176         break;
2177
2178       default: {
2179         VALUE tmp;
2180
2181         tmp = rb_check_string_type(sub);
2182         if (NIL_P(tmp)) {
2183             rb_raise(rb_eTypeError, "type mismatch: %s given",
2184                      rb_obj_classname(sub));
2185         }
2186         sub = tmp;
2187       }
2188         /* fall through */
2189       case T_STRING:
2190         pos = rb_str_index(str, sub, pos);
2191         pos = rb_str_sublen(str, pos);
2192         break;
2193     }
2194
2195     if (pos == -1) return Qnil;
2196     return LONG2NUM(pos);
2197 }
2198
2199 static long
2200 rb_str_rindex(VALUE str, VALUE sub, long pos)
2201 {
2202     long len, slen;
2203     char *s, *sbeg, *e, *t;
2204     rb_encoding *enc;
2205     int singlebyte = single_byte_optimizable(str);
2206
2207     enc = rb_enc_check(str, sub);
2208     if (is_broken_string(sub)) {
2209         return -1;
2210     }
2211     len = str_strlen(str, enc);
2212     slen = str_strlen(sub, enc);
2213     /* substring longer than string */
2214     if (len < slen) return -1;
2215     if (len - pos < slen) {
2216         pos = len - slen;
2217     }
2218     if (len == 0) {
2219         return pos;
2220     }
2221     sbeg = RSTRING_PTR(str);
2222     e = RSTRING_END(str);
2223     t = RSTRING_PTR(sub);
2224     slen = RSTRING_LEN(sub);
2225     for (;;) {
2226         s = str_nth(sbeg, e, pos, enc, singlebyte);
2227         if (!s) return -1;
2228         if (memcmp(s, t, slen) == 0) {
2229             return pos;
2230         }
2231         if (pos == 0) break;
2232         pos--;
2233     }
2234     return -1;
2235 }
2236
2237
2238 /*
2239  *  call-seq:
2240  *     str.rindex(substring [, fixnum])   => fixnum or nil
2241  *     str.rindex(regexp [, fixnum])   => fixnum or nil
2242  *
2243  *  Returns the index of the last occurrence of the given <i>substring</i> or
2244  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2245  *  found. If the second parameter is present, it specifies the position in the
2246  *  string to end the search---characters beyond this point will not be
2247  *  considered.
2248  *
2249  *     "hello".rindex('e')             #=> 1
2250  *     "hello".rindex('l')             #=> 3
2251  *     "hello".rindex('a')             #=> nil
2252  *     "hello".rindex(?e)              #=> 1
2253  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
2254  */
2255
2256 static VALUE
2257 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
2258 {
2259     VALUE sub;
2260     VALUE vpos;
2261     rb_encoding *enc = STR_ENC_GET(str);
2262     long pos, len = str_strlen(str, enc);
2263
2264     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2265         pos = NUM2LONG(vpos);
2266         if (pos < 0) {
2267             pos += len;
2268             if (pos < 0) {
2269                 if (TYPE(sub) == T_REGEXP) {
2270                     rb_backref_set(Qnil);
2271                 }
2272                 return Qnil;
2273             }
2274         }
2275         if (pos > len) pos = len;
2276     }
2277     else {
2278         pos = len;
2279     }
2280
2281     switch (TYPE(sub)) {
2282       case T_REGEXP:
2283         /* enc = rb_get_check(str, sub); */
2284         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2285             pos = rb_reg_adjust_startpos(sub, str, pos, 1);
2286             pos = rb_reg_search(sub, str, pos, 1);
2287             pos = rb_str_sublen(str, pos);
2288         }
2289         if (pos >= 0) return LONG2NUM(pos);
2290         break;
2291
2292       default: {
2293         VALUE tmp;
2294
2295         tmp = rb_check_string_type(sub);
2296         if (NIL_P(tmp)) {
2297             rb_raise(rb_eTypeError, "type mismatch: %s given",
2298                      rb_obj_classname(sub));
2299         }
2300         sub = tmp;
2301       }
2302         /* fall through */
2303       case T_STRING:
2304         pos = rb_str_rindex(str, sub, pos);
2305         if (pos >= 0) return LONG2NUM(pos);
2306         break;
2307     }
2308     return Qnil;
2309 }
2310
2311 /*
2312  *  call-seq:
2313  *     str =~ obj   => fixnum or nil
2314  *
2315  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
 *  against <i>str</i>, and returns the position the match starts, or
2317  *  <code>nil</code> if there is no match. Otherwise, invokes
2318  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2319  *  <code>=~</code> in <code>Object</code> returns <code>false</code>.
2320  *
2321  *     "cat o' 9 tails" =~ /\d/   #=> 7
2322  *     "cat o' 9 tails" =~ 9      #=> nil
2323  */
2324
2325 static VALUE
2326 rb_str_match(VALUE x, VALUE y)
2327 {
2328     switch (TYPE(y)) {
2329       case T_STRING:
2330         rb_raise(rb_eTypeError, "type mismatch: String given");
2331
2332       case T_REGEXP:
2333         return rb_reg_match(y, x);
2334
2335       default:
2336         return rb_funcall(y, rb_intern("=~"), 1, x);
2337     }
2338 }
2339
2340
/* forward declaration: get_pat() is defined further down but used by rb_str_match_m() */
static VALUE get_pat(VALUE, int);
2342
2343
2344 /*
2345  *  call-seq:
2346  *     str.match(pattern)   => matchdata or nil
2347  *
2348  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2349  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
2350  *  parameter is present, it specifies the position in the string to begin the
2351  *  search.
2352  *
2353  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
2354  *     'hello'.match('(.)\1')[0]   #=> "ll"
2355  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
2356  *     'hello'.match('xx')         #=> nil
2357  *
 *  If a block is given, invoke the block with MatchData if the match succeeds, so
2359  *  that you can write
2360  *
2361  *     str.match(pat) {|m| ...}
2362  *
2363  *  instead of
2364  *
2365  *     if m = str.match(pat)
2366  *       ...
2367  *     end
2368  *
2369  *  The return value is a value from block execution in this case.
2370  */
2371
2372 static VALUE
2373 rb_str_match_m(int argc, VALUE *argv, VALUE str)
2374 {
2375     VALUE re, result;
2376     if (argc < 1)
2377         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2378     re = argv[0];
2379     argv[0] = str;
2380     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2381     if (!NIL_P(result) && rb_block_given_p()) {
2382         return rb_yield(result);
2383     }
2384     return result;
2385 }
2386
/* result of stepping a character to its neighbor (successor or predecessor) */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* enc_succ_alnum_char: not a digit/letter, or its range has a single member */
    NEIGHBOR_FOUND,     /* the bytes were overwritten with a neighboring character */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the range (or of the byte space) */
};
2392
/*
 * Overwrite the +len+ bytes at +p+ with the next byte sequence that
 * rb_enc_precise_mbclen() accepts as one character of exactly +len+ bytes
 * in +enc+.  The bytes are treated as a counter whose last byte is the
 * least significant one.
 *
 * Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when every byte
 * was 0xff (all of them have been reset to 0 by then).
 */
static enum neighbor_char
enc_succ_char(char *p, int len, rb_encoding *enc)
{
    int i, l;
    while (1) {
        /* add one with carry: trailing 0xff bytes roll over to 0 */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /*
                 * a shorter character was recognized: saturate the bytes
                 * after it so the next pass carries into its last byte
                 */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            /*
             * invalid sequence, and the incremented byte was not the last
             * one: look for the longest prefix that is not reported
             * invalid and saturate the bytes after index len2 so the next
             * pass skips them
             */
            int len2, l2;
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
2424
/*
 * Overwrite the +len+ bytes at +p+ with the previous byte sequence that
 * rb_enc_precise_mbclen() accepts as one character of exactly +len+ bytes
 * in +enc+.  The bytes are treated as a counter whose last byte is the
 * least significant one.
 *
 * Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when every byte
 * was 0 (all of them have been set to 0xff by then).
 */
static enum neighbor_char
enc_pred_char(char *p, int len, rb_encoding *enc)
{
    int i, l;
    while (1) {
        /* subtract one with borrow: trailing 0 bytes roll over to 0xff */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /*
                 * a shorter character was recognized: zero the bytes after
                 * it so the next pass borrows from its last byte
                 */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            /*
             * invalid sequence, and the decremented byte was not the last
             * one: look for the longest prefix that is not reported
             * invalid and zero the bytes after index len2 so the next pass
             * skips them
             */
            int len2, l2;
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
2456
2457 /*
2458   overwrite +p+ by succeeding letter in +enc+ and returns
2459   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2460   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2461   assuming each ranges are successive, and mbclen
2462   never change in each ranges.
2463   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2464   character.
2465  */
2466 static enum neighbor_char
2467 enc_succ_alnum_char(char *p, int len, rb_encoding *enc, char *carry)
2468 {
2469     enum neighbor_char ret;
2470     int c;
2471     int ctype;
2472     int range;
2473     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2474
2475     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2476     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2477         ctype = ONIGENC_CTYPE_DIGIT;
2478     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2479         ctype = ONIGENC_CTYPE_ALPHA;
2480     else
2481         return NEIGHBOR_NOT_CHAR;
2482
2483     MEMCPY(save, p, char, len);
2484     ret = enc_succ_char(p, len, enc);
2485     if (ret == NEIGHBOR_FOUND) {
2486         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2487         if (rb_enc_isctype(c, ctype, enc))
2488             return NEIGHBOR_FOUND;
2489     }
2490     MEMCPY(p, save, char, len);
2491     range = 1;
2492     while (1) {
2493         MEMCPY(save, p, char, len);
2494         ret = enc_pred_char(p, len, enc);
2495         if (ret == NEIGHBOR_FOUND) {
2496             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2497             if (!rb_enc_isctype(c, ctype, enc)) {
2498                 MEMCPY(p, save, char, len);
2499                 break;
2500             }
2501         }
2502         else {
2503             MEMCPY(p, save, char, len);
2504             break;
2505         }
2506         range++;
2507     }
2508     if (range == 1) {
2509         return NEIGHBOR_NOT_CHAR;
2510     }
2511
2512     if (ctype != ONIGENC_CTYPE_DIGIT) {
2513         MEMCPY(carry, p, char, len);
2514         return NEIGHBOR_WRAPPED;
2515     }
2516
2517     MEMCPY(carry, p, char, len);
2518     enc_succ_char(carry, len, enc);
2519     return NEIGHBOR_WRAPPED;
2520 }
2521
2522
2523 /*
2524  *  call-seq:
2525  *     str.succ   => new_str
2526  *     str.next   => new_str
2527  *
2528  *  Returns the successor to <i>str</i>. The successor is calculated by
2529  *  incrementing characters starting from the rightmost alphanumeric (or
2530  *  the rightmost character if there are no alphanumerics) in the
2531  *  string. Incrementing a digit always results in another digit, and
2532  *  incrementing a letter results in another letter of the same case.
2533  *  Incrementing nonalphanumerics uses the underlying character set's
2534  *  collating sequence.
2535  *
2536  *  If the increment generates a ``carry,'' the character to the left of
2537  *  it is incremented. This process repeats until there is no carry,
2538  *  adding an additional character if necessary.
2539  *
2540  *     "abcd".succ        #=> "abce"
2541  *     "THX1138".succ     #=> "THX1139"
2542  *     "<<koala>>".succ   #=> "<<koalb>>"
2543  *     "1999zzz".succ     #=> "2000aaa"
2544  *     "ZZZ9999".succ     #=> "AAAA0000"
2545  *     "***".succ         #=> "**+"
2546  */
2547
VALUE
rb_str_succ(VALUE orig)
{
    rb_encoding *enc;
    VALUE str;
    char *sbeg, *s, *e, *last_alnum = 0;
    int c = -1;                 /* stays -1 until an alnum character has wrapped */
    long l;
    char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
    int carry_pos = 0, carry_len = 1;
    enum neighbor_char neighbor = NEIGHBOR_FOUND;

    /* all work is done on a copy; +orig+ itself is left untouched */
    str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
    rb_enc_cr_str_copy_for_substr(str, orig);
    OBJ_INFECT(str, orig);
    if (RSTRING_LEN(str) == 0) return str;

    enc = STR_ENC_GET(orig);
    sbeg = RSTRING_PTR(str);
    s = e = sbeg + RSTRING_LEN(str);

    /* first pass: walk the characters right to left, incrementing alnums */
    while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
        if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
            /*
             * the previous character was skipped as a non-alnum after an
             * earlier wrap: stop if the wrapped character and this one are
             * of different kinds (letter vs. digit)
             */
            if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
                ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
                s = last_alnum;
                break;
            }
        }
        /* skip positions where no character length is reported */
        if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
        neighbor = enc_succ_alnum_char(s, l, enc, carry);
        switch (neighbor) {
          case NEIGHBOR_NOT_CHAR:
            continue;
          case NEIGHBOR_FOUND:
            /* incremented without a carry: done */
            return str;
          case NEIGHBOR_WRAPPED:
            /* wrapped: keep going left, remembering where the carry would go */
            last_alnum = s;
            break;
        }
        c = 1;
        carry_pos = s - sbeg;
        carry_len = l;
    }
    if (c == -1) {              /* str contains no alnum */
        /* second pass: increment the rightmost character of any kind */
        s = e;
        while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
            enum neighbor_char neighbor;
            if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
            neighbor = enc_succ_char(s, l, enc);
            if (neighbor == NEIGHBOR_FOUND)
                return str;
            if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
                /* wrapped to \0...\0.  search next valid char. */
                enc_succ_char(s, l, enc);
            }
            if (!rb_enc_asciicompat(enc)) {
                /* for non ASCII-compatible encodings the carry is a copy of this character */
                MEMCPY(carry, s, char, l);
                carry_len = l;
            }
            carry_pos = s - sbeg;
        }
    }
    /* every candidate wrapped: insert the carry character at carry_pos */
    RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
    s = RSTRING_PTR(str) + carry_pos;   /* reload: the buffer may have moved */
    memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
    memmove(s, carry, carry_len);
    STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
    RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
    rb_enc_str_coderange(str);
    return str;
}
2620
2621
2622 /*
2623  *  call-seq:
2624  *     str.succ!   => str
2625  *     str.next!   => str
2626  *
2627  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
2628  *  place.
2629  */
2630
2631 static VALUE
2632 rb_str_succ_bang(VALUE str)
2633 {
2634     rb_str_shared_replace(str, rb_str_succ(str));
2635
2636     return str;
2637 }
2638
2639
2640 /*
2641  *  call-seq:
2642  *     str.upto(other_str, exclusive=false) {|s| block }   => str
2643  *
2644  *  Iterates through successive values, starting at <i>str</i> and
2645  *  ending at <i>other_str</i> inclusive, passing each value in turn to
2646  *  the block. The <code>String#succ</code> method is used to generate
2647  *  each value.  If optional second argument exclusive is omitted or is <code>false</code>,
2648  *  the last value will be included; otherwise it will be excluded.
2649  *
2650  *     "a8".upto("b6") {|s| print s, ' ' }
2651  *     for s in "a8".."b6"
2652  *       print s, ' '
2653  *     end
2654  *
2655  *  <em>produces:</em>
2656  *
2657  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2658  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2659  */
2660
2661 static VALUE
2662 rb_str_upto(int argc, VALUE *argv, VALUE beg)
2663 {
2664     VALUE end, exclusive;
2665     VALUE current, after_end;
2666     ID succ;
2667     int n, excl;
2668     rb_encoding *enc;
2669
2670     rb_scan_args(argc, argv, "11", &end, &exclusive);
2671     excl = RTEST(exclusive);
2672     CONST_ID(succ, "succ");
2673     StringValue(end);
2674     enc = rb_enc_check(beg, end);
2675     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 &&
2676         is_ascii_string(beg) && is_ascii_string(end)) {
2677         char c = RSTRING_PTR(beg)[0];
2678         char e = RSTRING_PTR(end)[0];
2679
2680         if (c > e || (excl && c == e)) return beg;
2681         for (;;) {
2682             rb_yield(rb_enc_str_new(&c, 1, enc));
2683             if (!excl && c == e) break;
2684             c++;
2685             if (excl && c == e) break;
2686         }
2687         return beg;
2688     }
2689     n = rb_str_cmp(beg, end);
2690     if (n > 0 || (excl && n == 0)) return beg;
2691
2692     after_end = rb_funcall(end, succ, 0, 0);
2693     current = beg;
2694     while (!rb_str_equal(current, after_end)) {
2695         rb_yield(current);
2696         if (!excl && rb_str_equal(current, end)) break;
2697         current = rb_funcall(current, succ, 0, 0);
2698         StringValue(current);
2699         if (excl && rb_str_equal(current, end)) break;
2700         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
2701             break;
2702     }
2703
2704     return beg;
2705 }
2706
2707 static VALUE
2708 rb_str_subpat(VALUE str, VALUE re, int nth)
2709 {
2710     if (rb_reg_search(re, str, 0, 0) >= 0) {
2711         return rb_reg_nth_match(nth, rb_backref_get());
2712     }
2713     return Qnil;
2714 }
2715
2716 static VALUE
2717 rb_str_aref(VALUE str, VALUE indx)
2718 {
2719     long idx;
2720
2721     switch (TYPE(indx)) {
2722       case T_FIXNUM:
2723         idx = FIX2LONG(indx);
2724
2725       num_index:
2726         str = rb_str_substr(str, idx, 1);
2727         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
2728         return str;
2729
2730       case T_REGEXP:
2731         return rb_str_subpat(str, indx, 0);
2732
2733       case T_STRING:
2734         if (rb_str_index(str, indx, 0) != -1)
2735             return rb_str_dup(indx);
2736         return Qnil;
2737
2738       default:
2739         /* check if indx is Range */
2740         {
2741             long beg, len;
2742             VALUE tmp;
2743
2744             len = str_strlen(str, STR_ENC_GET(str));
2745             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
2746               case Qfalse:
2747                 break;
2748               case Qnil:
2749                 return Qnil;
2750               default:
2751                 tmp = rb_str_substr(str, beg, len);
2752                 return tmp;
2753             }
2754         }
2755         idx = NUM2LONG(indx);
2756         goto num_index;
2757     }
2758     return Qnil;                /* not reached */
2759 }
2760
2761
2762 /*
2763  *  call-seq:
2764  *     str[fixnum]                 => new_str or nil
2765  *     str[fixnum, fixnum]         => new_str or nil
2766  *     str[range]                  => new_str or nil
2767  *     str[regexp]                 => new_str or nil
2768  *     str[regexp, fixnum]         => new_str or nil
2769  *     str[other_str]              => new_str or nil
2770  *     str.slice(fixnum)           => new_str or nil
2771  *     str.slice(fixnum, fixnum)   => new_str or nil
2772  *     str.slice(range)            => new_str or nil
2773  *     str.slice(regexp)           => new_str or nil
2774  *     str.slice(regexp, fixnum)   => new_str or nil
2775  *     str.slice(other_str)        => new_str or nil
2776  *
2777  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
2778  *  substring of one character at that position. If passed two <code>Fixnum</code>
2779  *  objects, returns a substring starting at the offset given by the first, and
2780  *  a length given by the second. If given a range, a substring containing
2781  *  characters at offsets given by the range is returned. In all three cases, if
2782  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
2783  *  <code>nil</code> if the initial offset falls outside the string, the length
2784  *  is negative, or the beginning of the range is greater than the end.
2785  *
2786  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
2787  *  returned. If a numeric parameter follows the regular expression, that
2788  *  component of the <code>MatchData</code> is returned instead. If a
2789  *  <code>String</code> is given, that string is returned if it occurs in
2790  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
2791  *  match.
2792  *
2793  *     a = "hello there"
2794  *     a[1]                   #=> "e"
2795  *     a[1,3]                 #=> "ell"
2796  *     a[1..3]                #=> "ell"
2797  *     a[-3,2]                #=> "er"
2798  *     a[-4..-2]              #=> "her"
2799  *     a[12..-1]              #=> nil
2800  *     a[-2..-4]              #=> ""
2801  *     a[/[aeiou](.)\1/]      #=> "ell"
2802  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
2803  *     a[/[aeiou](.)\1/, 1]   #=> "l"
2804  *     a[/[aeiou](.)\1/, 2]   #=> nil
2805  *     a["lo"]                #=> "lo"
2806  *     a["bye"]               #=> nil
2807  */
2808
2809 static VALUE
2810 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
2811 {
2812     if (argc == 2) {
2813         if (TYPE(argv[0]) == T_REGEXP) {
2814             return rb_str_subpat(str, argv[0], NUM2INT(argv[1]));
2815         }
2816         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
2817     }
2818     if (argc != 1) {
2819         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2820     }
2821     return rb_str_aref(str, argv[0]);
2822 }
2823
/*
 * Remove the first +len+ bytes of +str+ in place and return +str+.
 * A +len+ larger than the string is clamped to the string length.
 * The code range of +str+ is cleared.
 */
VALUE
rb_str_drop_bytes(VALUE str, long len)
{
    char *ptr = RSTRING_PTR(str);
    long olen = RSTRING_LEN(str), nlen;

    str_modifiable(str);
    if (len > olen) len = olen;
    nlen = olen - len;
    if (nlen <= RSTRING_EMBED_LEN_MAX) {
        /* the remainder fits into the embedded buffer: move it there */
        char *oldptr = ptr;
        int fl = (RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
        STR_SET_EMBED(str);
        STR_SET_EMBED_LEN(str, nlen);
        ptr = RSTRING(str)->as.ary;
        /* memmove: source and destination overlap when str was already embedded */
        memmove(ptr, oldptr + len, nlen);
        /* free the old buffer only if it was a non-embedded, non-shared one */
        if (fl == STR_NOEMBED) xfree(oldptr);
    }
    else {
        /*
         * too long to embed: advance the heap pointer instead of copying.
         * NOTE(review): rb_str_new4() is called for its side effect on str
         * when str is not shared yet -- presumably it turns the buffer into
         * a shared one so the start pointer may move; confirm against
         * rb_str_new4.
         */
        if (!STR_SHARED_P(str)) rb_str_new4(str);
        ptr = RSTRING(str)->as.heap.ptr += len;
        RSTRING(str)->as.heap.len = nlen;
    }
    ptr[nlen] = 0;
    ENC_CODERANGE_CLEAR(str);
    return str;
}
2851
/*
 * Replace the +len+ bytes of +str+ that start at byte offset +beg+ with
 * the contents of +val+.  +beg+ and +len+ are physical (byte) values; the
 * callers in this file translate character positions beforehand.
 * +str+ is infected by +val+ (OBJ_INFECT).
 */
static void
rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
{
    /* deleting a prefix: let rb_str_drop_bytes do it */
    if (beg == 0 && RSTRING_LEN(val) == 0) {
        rb_str_drop_bytes(str, len);
        OBJ_INFECT(str, val);
        return;
    }

    rb_str_modify(str);
    if (len < RSTRING_LEN(val)) {
        /* expand string */
        RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
    }

    if (RSTRING_LEN(val) != len) {
        /* shift the tail so that it follows the replacement */
        memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
                RSTRING_PTR(str) + beg + len,
                RSTRING_LEN(str) - (beg + len));
    }
    /* NOTE(review): only taken for a negative len; the callers visible here appear to pass len >= 0 */
    if (RSTRING_LEN(val) < beg && len < 0) {
        MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
    }
    if (RSTRING_LEN(val) > 0) {
        /* copy the replacement into the gap */
        memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
    }
    STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
    if (RSTRING_PTR(str)) {
        RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
    }
    OBJ_INFECT(str, val);
}
2884
2885 static void
2886 rb_str_splice(VALUE str, long beg, long len, VALUE val)
2887 {
2888     long slen;
2889     char *p, *e;
2890     rb_encoding *enc;
2891     int singlebyte = single_byte_optimizable(str);
2892
2893     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
2894
2895     StringValue(val);
2896     rb_str_modify(str);
2897     enc = rb_enc_check(str, val);
2898     slen = str_strlen(str, enc);
2899
2900     if (slen < beg) {
2901       out_of_range:
2902         rb_raise(rb_eIndexError, "index %ld out of string", beg);
2903     }
2904     if (beg < 0) {
2905         if (-beg > slen) {
2906             goto out_of_range;
2907         }
2908         beg += slen;
2909     }
2910     if (slen < len || slen < beg + len) {
2911         len = slen - beg;
2912     }
2913     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
2914     if (!p) p = RSTRING_END(str);
2915     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
2916     if (!e) e = RSTRING_END(str);
2917     /* error check */
2918     beg = p - RSTRING_PTR(str); /* physical position */
2919     len = e - p;                /* physical length */
2920     rb_str_splice_0(str, beg, len, val);
2921     rb_enc_associate(str, enc);
2922 }
2923
/*
 * Non-static entry point for rb_str_splice(): replace +len+ characters of
 * +str+ starting at character index +beg+ with +val+.
 */
void
rb_str_update(VALUE str, long beg, long len, VALUE val)
{
    rb_str_splice(str, beg, len, val);
}
2929
2930 static void
2931 rb_str_subpat_set(VALUE str, VALUE re, int nth, VALUE val)
2932 {
2933     VALUE match;
2934     long start, end, len;
2935     rb_encoding *enc;
2936     struct re_registers *regs;
2937
2938     if (rb_reg_search(re, str, 0, 0) < 0) {
2939         rb_raise(rb_eIndexError, "regexp not matched");
2940     }
2941     match = rb_backref_get();
2942     regs = RMATCH_REGS(match);
2943     if (nth >= regs->num_regs) {
2944       out_of_range:
2945         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
2946     }
2947     if (nth < 0) {
2948         if (-nth >= regs->num_regs) {
2949             goto out_of_range;
2950         }
2951         nth += regs->num_regs;
2952     }
2953
2954     start = BEG(nth);
2955     if (start == -1) {
2956         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
2957     }
2958     end = END(nth);
2959     len = end - start;
2960     StringValue(val);
2961     enc = rb_enc_check(str, val);
2962     rb_str_splice_0(str, start, len, val);
2963     rb_enc_associate(str, enc);
2964 }
2965
2966 static VALUE
2967 rb_str_aset(VALUE str, VALUE indx, VALUE val)
2968 {
2969     long idx, beg;
2970
2971     switch (TYPE(indx)) {
2972       case T_FIXNUM:
2973         idx = FIX2LONG(indx);
2974       num_index:
2975         rb_str_splice(str, idx, 1, val);
2976         return val;
2977
2978       case T_REGEXP:
2979         rb_str_subpat_set(str, indx, 0, val);
2980         return val;
2981
2982       case T_STRING:
2983         beg = rb_str_index(str, indx, 0);
2984         if (beg < 0) {
2985             rb_raise(rb_eIndexError, "string not matched");
2986         }
2987         beg = rb_str_sublen(str, beg);
2988         rb_str_splice(str, beg, str_strlen(indx, 0), val);
2989         return val;
2990
2991       default:
2992         /* check if indx is Range */
2993         {
2994             long beg, len;
2995             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
2996                 rb_str_splice(str, beg, len, val);
2997                 return val;
2998             }
2999         }
3000         idx = NUM2LONG(indx);
3001         goto num_index;
3002     }
3003 }
3004
3005 /*
3006  *  call-seq:
3007  *     str[fixnum] = new_str
3008  *     str[fixnum, fixnum] = new_str
3009  *     str[range] = aString
3010  *     str[regexp] = new_str
3011  *     str[regexp, fixnum] = new_str
3012  *     str[other_str] = new_str
3013  *
3014  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
3015  *  portion of the string affected is determined using the same criteria as
3016  *  <code>String#[]</code>. If the replacement string is not the same length as
3017  *  the text it is replacing, the string will be adjusted accordingly. If the
 *  regular expression or string used as the index doesn't match a position
 *  in the string, <code>IndexError</code> is raised. If the regular expression
 *  form is used, the optional second <code>Fixnum</code> allows you to specify
 *  which portion of the match to replace (effectively using the
 *  <code>MatchData</code> indexing rules). The forms that take a
 *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
 *  out of range; the <code>Range</code> form will raise a
 *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
 *  forms will raise an <code>IndexError</code> on negative match.
3027  */
3028
3029 static VALUE
3030 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
3031 {
3032     if (argc == 3) {
3033         if (TYPE(argv[0]) == T_REGEXP) {
3034             rb_str_subpat_set(str, argv[0], NUM2INT(argv[1]), argv[2]);
3035         }
3036         else {
3037             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
3038         }
3039         return argv[2];
3040     }
3041     if (argc != 2) {
3042         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3043     }
3044     return rb_str_aset(str, argv[0], argv[1]);
3045 }
3046
3047 /*
3048  *  call-seq:
3049  *     str.insert(index, other_str)   => str
3050  *
3051  *  Inserts <i>other_str</i> before the character at the given
3052  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
3053  *  end of the string, and insert <em>after</em> the given character.
 *  The intent is to insert <i>other_str</i> so that it starts at the given
3055  *  <i>index</i>.
3056  *
3057  *     "abcd".insert(0, 'X')    #=> "Xabcd"
3058  *     "abcd".insert(3, 'X')    #=> "abcXd"
3059  *     "abcd".insert(4, 'X')    #=> "abcdX"
3060  *     "abcd".insert(-3, 'X')   #=> "abXcd"
3061  *     "abcd".insert(-1, 'X')   #=> "abcdX"
3062  */
3063
3064 static VALUE
3065 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
3066 {
3067     long pos = NUM2LONG(idx);
3068
3069     if (pos == -1) {
3070         return rb_str_append(str, str2);
3071     }
3072     else if (pos < 0) {
3073         pos++;
3074     }
3075     rb_str_splice(str, pos, 0, str2);
3076     return str;
3077 }
3078
3079
3080 /*
3081  *  call-seq:
 *     str.slice!(fixnum)           => new_str or nil
3083  *     str.slice!(fixnum, fixnum)   => new_str or nil
3084  *     str.slice!(range)            => new_str or nil
3085  *     str.slice!(regexp)           => new_str or nil
3086  *     str.slice!(other_str)        => new_str or nil
3087  *
3088  *  Deletes the specified portion from <i>str</i>, and returns the portion
3089  *  deleted.
3090  *
3091  *     string = "this is a string"
3092  *     string.slice!(2)        #=> "i"
3093  *     string.slice!(3..6)     #=> " is "
3094  *     string.slice!(/s.*t/)   #=> "sa st"
3095  *     string.slice!("r")      #=> "r"
3096  *     string                  #=> "thing"
3097  */
3098
3099 static VALUE
3100 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
3101 {
3102     VALUE result;
3103     VALUE buf[3];
3104     int i;
3105
3106     if (argc < 1 || 2 < argc) {
3107         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
3108     }
3109     for (i=0; i<argc; i++) {
3110         buf[i] = argv[i];
3111     }
3112     rb_str_modify(str);
3113     buf[i] = rb_str_new(0,0);
3114     result = rb_str_aref_m(argc, buf, str);
3115     if (!NIL_P(result)) {
3116         rb_str_aset_m(argc+1, buf, str);
3117     }
3118     return result;
3119 }
3120
3121 static VALUE
3122 get_pat(VALUE pat, int quote)
3123 {
3124     VALUE val;
3125
3126     switch (TYPE(pat)) {
3127       case T_REGEXP:
3128         return pat;
3129
3130       case T_STRING:
3131         break;
3132
3133       default:
3134         val = rb_check_string_type(pat);
3135         if (NIL_P(val)) {
3136             Check_Type(pat, T_REGEXP);
3137         }
3138         pat = val;
3139     }
3140
3141     if (quote) {
3142         pat = rb_reg_quote(pat);
3143     }
3144
3145     return rb_reg_regcomp(pat);
3146 }
3147
3148
3149 /*
3150  *  call-seq:
3151  *     str.sub!(pattern, replacement)          => str or nil
3152  *     str.sub!(pattern) {|match| block }      => str or nil
3153  *
3154  *  Performs the substitutions of <code>String#sub</code> in place,
3155  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
3156  *  performed.
3157  */
3158
3159 static VALUE
3160 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
3161 {
3162     VALUE pat, repl, hash = Qnil;
3163     int iter = 0;
3164     int tainted = 0;
3165     int untrusted = 0;
3166     long plen;
3167
3168     if (argc == 1 && rb_block_given_p()) {
3169         iter = 1;
3170     }
3171     else if (argc == 2) {
3172         repl = argv[1];
3173         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3174         if (NIL_P(hash)) {
3175             StringValue(repl);
3176         }
3177         if (OBJ_TAINTED(repl)) tainted = 1;
3178         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
3179     }
3180     else {
3181         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3182     }
3183
3184     pat = get_pat(argv[0], 1);
3185     if (rb_reg_search(pat, str, 0, 0) >= 0) {
3186         rb_encoding *enc;
3187         int cr = ENC_CODERANGE(str);
3188         VALUE match = rb_backref_get();
3189         struct re_registers *regs = RMATCH_REGS(match);
3190         long beg0 = BEG(0);
3191         long end0 = END(0);
3192
3193         if (iter || !NIL_P(hash)) {
3194             char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
3195
3196             if (iter) {
3197                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3198             }
3199             else {
3200                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
3201                 repl = rb_obj_as_string(repl);
3202             }
3203             str_mod_check(str, p, len);
3204             str_frozen_check(str);
3205         }
3206         else {
3207             repl = rb_reg_regsub(repl, str, regs, pat);
3208         }
3209         enc = rb_enc_compatible(str, repl);
3210         if (!enc) {
3211             rb_encoding *str_enc = STR_ENC_GET(str);
3212             if (coderange_scan(RSTRING_PTR(str), beg0, str_enc) != ENC_CODERANGE_7BIT ||
3213                 coderange_scan(RSTRING_PTR(str)+end0,
3214                                RSTRING_LEN(str)-end0, str_enc) != ENC_CODERANGE_7BIT) {
3215                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3216                          rb_enc_name(str_enc),
3217                          rb_enc_name(STR_ENC_GET(repl)));
3218             }
3219             enc = STR_ENC_GET(repl);
3220         }
3221         rb_str_modify(str);
3222         rb_enc_associate(str, enc);
3223         if (OBJ_TAINTED(repl)) tainted = 1;
3224         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
3225         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
3226             int cr2 = ENC_CODERANGE(repl);
3227             if (cr2 == ENC_CODERANGE_UNKNOWN || cr2 > cr) cr = cr2;
3228         }
3229         plen = end0 - beg0;
3230         if (RSTRING_LEN(repl) > plen) {
3231             RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
3232         }
3233         if (RSTRING_LEN(repl) != plen) {
3234             memmove(RSTRING_PTR(str) + beg0 + RSTRING_LEN(repl),
3235                     RSTRING_PTR(str) + beg0 + plen,
3236                     RSTRING_LEN(str) - beg0 - plen);
3237         }
3238         memcpy(RSTRING_PTR(str) + beg0,
3239                RSTRING_PTR(repl), RSTRING_LEN(repl));
3240         STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
3241         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3242         ENC_CODERANGE_SET(str, cr);
3243         if (tainted) OBJ_TAINT(str);
3244         if (untrusted) OBJ_UNTRUST(str);
3245
3246         return str;
3247     }
3248     return Qnil;
3249 }
3250
3251
3252 /*
3253  *  call-seq:
3254  *     str.sub(pattern, replacement)         => new_str
3255  *     str.sub(pattern) {|match| block }     => new_str
3256  *
3257  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
3258  *  <i>pattern</i> replaced with either <i>replacement</i> or the value of the
3259  *  block. The <i>pattern</i> will typically be a <code>Regexp</code>; if it is
3260  *  a <code>String</code> then no regular expression metacharacters will be
3261  *  interpreted (that is <code>/\d/</code> will match a digit, but
3262  *  <code>'\d'</code> will match a backslash followed by a 'd').
3263  *
3264  *  If the method call specifies <i>replacement</i>, special variables such as
3265  *  <code>$&</code> will not be useful, as substitution into the string occurs
3266  *  before the pattern match starts. However, the sequences <code>\1</code>,
3267  *  <code>\2</code>, <code>\k<group_name></code>, etc., may be used.
3268  *
3269  *  In the block form, the current match string is passed in as a parameter, and
3270  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3271  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3272  *  returned by the block will be substituted for the match on each call.
3273  *
3274  *  The result inherits any tainting in the original string or any supplied
3275  *  replacement string.
3276  *
3277  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
3278  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
3279  *     "hello".sub(/./) {|s| s[0].ord.to_s + ' ' }  #=> "104 ello"
3280  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
3281  */
3282
3283 static VALUE
3284 rb_str_sub(int argc, VALUE *argv, VALUE str)
3285 {
3286     str = rb_str_dup(str);
3287     rb_str_sub_bang(argc, argv, str);
3288     return str;
3289 }
3290
3291 static VALUE
3292 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
3293 {
3294     VALUE pat, val, repl, match, dest, hash = Qnil;
3295     struct re_registers *regs;
3296     long beg, n;
3297     long beg0, end0;
3298     long offset, blen, slen, len, last;
3299     int iter = 0;
3300     char *sp, *cp;
3301     int tainted = 0;
3302     rb_encoding *str_enc;
3303
3304     switch (argc) {
3305       case 1:
3306         RETURN_ENUMERATOR(str, argc, argv);
3307         iter = 1;
3308         break;
3309       case 2:
3310         repl = argv[1];
3311         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3312         if (NIL_P(hash)) {
3313             StringValue(repl);
3314         }
3315         if (OBJ_TAINTED(repl)) tainted = 1;
3316         break;
3317       default:
3318         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3319     }
3320
3321     pat = get_pat(argv[0], 1);
3322     beg = rb_reg_search(pat, str, 0, 0);
3323     if (beg < 0) {
3324         if (bang) return Qnil;  /* no match, no substitution */
3325         return rb_str_dup(str);
3326     }
3327
3328     offset = 0;
3329     n = 0;
3330     blen = RSTRING_LEN(str) + 30; /* len + margin */
3331     dest = rb_str_buf_new(blen);
3332     sp = RSTRING_PTR(str);
3333     slen = RSTRING_LEN(str);
3334     cp = sp;
3335     str_enc = STR_ENC_GET(str);
3336
3337     do {
3338         n++;
3339         match = rb_backref_get();
3340         regs = RMATCH_REGS(match);
3341         beg0 = BEG(0);
3342         end0 = END(0);
3343         if (iter || !NIL_P(hash)) {
3344             if (iter) {
3345                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3346             }
3347             else {
3348                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3349                 val = rb_obj_as_string(val);
3350             }
3351             str_mod_check(str, sp, slen);
3352             if (bang) str_frozen_check(str);
3353             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
3354                 rb_raise(rb_eRuntimeError, "block should not cheat");
3355             }
3356         }
3357         else {
3358             val = rb_reg_regsub(repl, str, regs, pat);
3359         }
3360
3361         if (OBJ_TAINTED(val)) tainted = 1;
3362
3363         len = beg - offset;     /* copy pre-match substr */
3364         if (len) {
3365             rb_enc_str_buf_cat(dest, cp, len, str_enc);
3366         }
3367
3368         rb_str_buf_append(dest, val);
3369
3370         last = offset;
3371         offset = end0;
3372         if (beg0 == end0) {
3373             /*
3374              * Always consume at least one character of the input string
3375              * in order to prevent infinite loops.
3376              */
3377             if (RSTRING_LEN(str) <= end0) break;
3378             len = rb_enc_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
3379             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
3380             offset = end0 + len;
3381         }
3382         cp = RSTRING_PTR(str) + offset;
3383         if (offset > RSTRING_LEN(str)) break;
3384         beg = rb_reg_search(pat, str, offset, 0);
3385     } while (beg >= 0);
3386     if (RSTRING_LEN(str) > offset) {
3387         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
3388     }
3389     rb_reg_search(pat, str, last, 0);
3390     if (bang) {
3391         rb_str_shared_replace(str, dest);
3392     }
3393     else {
3394         RBASIC(dest)->klass = rb_obj_class(str);
3395         OBJ_INFECT(dest, str);
3396         str = dest;
3397     }
3398
3399     if (tainted) OBJ_TAINT(str);
3400     return str;
3401 }
3402
3403
3404 /*
3405  *  call-seq:
3406  *     str.gsub!(pattern, replacement)        => str or nil
3407  *     str.gsub!(pattern) {|match| block }    => str or nil
3408  *
3409  *  Performs the substitutions of <code>String#gsub</code> in place, returning
3410  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
3411  */
3412
3413 static VALUE
3414 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
3415 {
3416     return str_gsub(argc, argv, str, 1);
3417 }
3418
3419
3420 /*
3421  *  call-seq:
3422  *     str.gsub(pattern, replacement)       => new_str
3423  *     str.gsub(pattern) {|match| block }   => new_str
3424  *
3425  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of <i>pattern</i>
3426  *  replaced with either <i>replacement</i> or the value of the block. The
3427  *  <i>pattern</i> will typically be a <code>Regexp</code>; if it is a
3428  *  <code>String</code> then no regular expression metacharacters will be
3429  *  interpreted (that is <code>/\d/</code> will match a digit, but
3430  *  <code>'\d'</code> will match a backslash followed by a 'd').
3431  *
3432  *  If a string is used as the replacement, special variables from the match
3433  *  (such as <code>$&</code> and <code>$1</code>) cannot be substituted into it,
3434  *  as substitution into the string occurs before the pattern match
3435  *  starts. However, the sequences <code>\1</code>, <code>\2</code>,
3436  *  <code>\k<group_name></code>, and so on may be used to interpolate
3437  *  successive groups in the match.
3438  *
3439  *  In the block form, the current match string is passed in as a parameter, and
3440  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3441  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3442  *  returned by the block will be substituted for the match on each call.
3443  *
3444  *  The result inherits any tainting in the original string or any supplied
3445  *  replacement string.
3446  *
3447  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
3448  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
3449  *     "hello".gsub(/./) {|s| s[0].ord.to_s + ' '}   #=> "104 101 108 108 111 "
3450  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
3451  */
3452
3453 static VALUE
3454 rb_str_gsub(int argc, VALUE *argv, VALUE str)
3455 {
3456     return str_gsub(argc, argv, str, 0);
3457 }
3458
3459
3460 /*
3461  *  call-seq:
3462  *     str.replace(other_str)   => str
3463  *
3464  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
3465  *  values in <i>other_str</i>.
3466  *
3467  *     s = "hello"         #=> "hello"
3468  *     s.replace "world"   #=> "world"
3469  */
3470
3471 static VALUE
3472 rb_str_replace(VALUE str, VALUE str2)
3473 {
3474     long len;
3475     if (str == str2) return str;
3476
3477     StringValue(str2);
3478     len = RSTRING_LEN(str2);
3479     if (STR_ASSOC_P(str2)) {
3480         str2 = rb_str_new4(str2);
3481     }
3482     if (str_independent(str) && !STR_EMBED_P(str)) {
3483         xfree(RSTRING_PTR(str));
3484     }
3485     if (STR_SHARED_P(str2)) {
3486         STR_SET_NOEMBED(str);
3487         RSTRING(str)->as.heap.len = len;
3488         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
3489         FL_SET(str, ELTS_SHARED);
3490         FL_UNSET(str, STR_ASSOC);
3491         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
3492     }
3493     else {
3494         str_replace_shared(str, rb_str_new4(str2));
3495     }
3496
3497     OBJ_INFECT(str, str2);
3498     rb_enc_cr_str_exact_copy(str, str2);
3499     return str;
3500 }
3501
3502 /*
3503  *  call-seq:
3504  *     string.clear    ->  string
3505  *
3506  *  Makes string empty.
3507  *
3508  *     a = "abcde"
3509  *     a.clear    #=> ""
3510  */
3511
3512 static VALUE
3513 rb_str_clear(VALUE str)
3514 {
3515     /* rb_str_modify() */       /* no need for str_make_independent */
3516     if (str_independent(str) && !STR_EMBED_P(str)) {
3517         xfree(RSTRING_PTR(str));
3518     }
3519     STR_SET_EMBED(str);
3520     STR_SET_EMBED_LEN(str, 0);
3521     RSTRING_PTR(str)[0] = 0;
3522     ENC_CODERANGE_CLEAR(str);
3523     return str;
3524 }
3525
3526 /*
3527  *  call-seq:
3528  *     string.chr    ->  string
3529  *
3530  *  Returns a one-character string at the beginning of the string.
3531  *
3532  *     a = "abcde"
3533  *     a.chr    #=> "a"
3534  */
3535
3536 static VALUE
3537 rb_str_chr(VALUE str)
3538 {
3539     return rb_str_substr(str, 0, 1);
3540 }
3541
3542 /*
3543  *  call-seq:
3544  *     str.getbyte(index)          => 0 .. 255
3545  *
3546  *  returns the <i>index</i>th byte as an integer.
3547  */
3548 static VALUE
3549 rb_str_getbyte(VALUE str, VALUE index)
3550 {
3551     long pos = NUM2LONG(index);
3552
3553     if (pos < 0)
3554         pos += RSTRING_LEN(str);
3555     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
3556         return Qnil;
3557
3558     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
3559 }
3560
3561 /*
3562  *  call-seq:
3563  *     str.setbyte(index, int) => int
3564  *
3565  *  modifies the <i>index</i>th byte as <i>int</i>.
3566  */
3567 static VALUE
3568 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
3569 {
3570     long pos = NUM2LONG(index);
3571     int byte = NUM2INT(value);
3572
3573     rb_str_modify(str);
3574
3575     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
3576         rb_raise(rb_eIndexError, "index %ld out of string", pos);
3577     if (pos < 0)
3578         pos += RSTRING_LEN(str);
3579
3580     RSTRING_PTR(str)[pos] = byte;
3581
3582     return value;
3583 }
3584
3585 /*
3586  *  call-seq:
3587  *     str.reverse   => new_str
3588  *
3589  *  Returns a new string with the characters from <i>str</i> in reverse order.
3590  *
3591  *     "stressed".reverse   #=> "desserts"
3592  */
3593
3594 static VALUE
3595 rb_str_reverse(VALUE str)
3596 {
3597     rb_encoding *enc;
3598     VALUE rev;
3599     char *s, *e, *p;
3600     int single = 1;
3601
3602     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
3603     enc = STR_ENC_GET(str);
3604     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
3605     s = RSTRING_PTR(str); e = RSTRING_END(str);
3606     p = RSTRING_END(rev);
3607
3608     if (RSTRING_LEN(str) > 1) {
3609         if (single_byte_optimizable(str)) {
3610             while (s < e) {
3611                 *--p = *s++;
3612             }
3613         }
3614         else {
3615             while (s < e) {
3616                 int clen = rb_enc_mbclen(s, e, enc);
3617
3618                 if (clen > 1 || (*s & 0x80)) single = 0;
3619                 p -= clen;
3620                 memcpy(p, s, clen);
3621                 s += clen;
3622             }
3623         }
3624     }
3625     STR_SET_LEN(rev, RSTRING_LEN(str));
3626     OBJ_INFECT(rev, str);
3627     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
3628         if (single) {
3629             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
3630         }
3631         else {
3632             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
3633         }
3634     }
3635     rb_enc_cr_str_copy_for_substr(rev, str);
3636
3637     return rev;
3638 }
3639
3640
3641 /*
3642  *  call-seq:
3643  *     str.reverse!   => str
3644  *
3645  *  Reverses <i>str</i> in place.
3646  */
3647
3648 static VALUE
3649 rb_str_reverse_bang(VALUE str)
3650 {
3651     if (RSTRING_LEN(str) > 1) {
3652         if (single_byte_optimizable(str)) {
3653             char *s, *e, c;
3654             int cr = ENC_CODERANGE(str);
3655             int single = 1;
3656
3657             rb_str_modify(str);
3658             s = RSTRING_PTR(str);
3659             e = RSTRING_END(str) - 1;
3660             while (s < e) {
3661                 c = *s;
3662                 if (*s & 0x80) single = 0;
3663                 *s++ = *e;
3664                 *e-- = c;
3665             }
3666             if (cr == ENC_CODERANGE_UNKNOWN && single) {
3667                 cr = ENC_CODERANGE_7BIT;
3668             }
3669             ENC_CODERANGE_SET(str, cr);
3670         }
3671         else {
3672             rb_str_shared_replace(str, rb_str_reverse(str));
3673         }
3674     }
3675     return str;
3676 }
3677
3678
3679 /*
3680  *  call-seq:
3681  *     str.include? other_str   => true or false
3682  *
3683  *  Returns <code>true</code> if <i>str</i> contains the given string or
3684  *  character.
3685  *
3686  *     "hello".include? "lo"   #=> true
3687  *     "hello".include? "ol"   #=> false
3688  *     "hello".include? ?h     #=> true
3689  */
3690
3691 static VALUE
3692 rb_str_include(VALUE str, VALUE arg)
3693 {
3694     long i;
3695
3696     StringValue(arg);
3697     i = rb_str_index(str, arg, 0);
3698
3699     if (i == -1) return Qfalse;
3700     return Qtrue;
3701 }
3702
3703
3704 /*
3705  *  call-seq:
3706  *     str.to_i(base=10)   => integer
3707  *
3708  *  Returns the result of interpreting leading characters in <i>str</i> as an
3709  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
3710  *  end of a valid number are ignored. If there is not a valid number at the
3711  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
3712  *  exception when <i>base</i> is valid (a negative one raises ArgumentError).
3713  *
3714  *     "12345".to_i             #=> 12345
3715  *     "99 red balloons".to_i   #=> 99
3716  *     "0a".to_i                #=> 0
3717  *     "0a".to_i(16)            #=> 10
3718  *     "hello".to_i             #=> 0
3719  *     "1100101".to_i(2)        #=> 101
3720  *     "1100101".to_i(8)        #=> 294977
3721  *     "1100101".to_i(10)       #=> 1100101
3722  *     "1100101".to_i(16)       #=> 17826049
3723  */
3724
3725 static VALUE
3726 rb_str_to_i(int argc, VALUE *argv, VALUE str)
3727 {
3728     int base;
3729
3730     if (argc == 0) base = 10;
3731     else {
3732         VALUE b;
3733
3734         rb_scan_args(argc, argv, "01", &b);
3735         base = NUM2INT(b);
3736     }
3737     if (base < 0) {
3738         rb_raise(rb_eArgError, "invalid radix %d", base);
3739     }
3740     return rb_str_to_inum(str, base, Qfalse);
3741 }
3742
3743
3744 /*
3745  *  call-seq:
3746  *     str.to_f   => float
3747  *
3748  *  Returns the result of interpreting leading characters in <i>str</i> as a
3749  *  floating point number. Extraneous characters past the end of a valid number
3750  *  are ignored. If there is not a valid number at the start of <i>str</i>,
3751  *  <code>0.0</code> is returned. This method never raises an exception.
3752  *
3753  *     "123.45e1".to_f        #=> 1234.5
3754  *     "45.67 degrees".to_f   #=> 45.67
3755  *     "thx1138".to_f         #=> 0.0
3756  */
3757
3758 static VALUE
3759 rb_str_to_f(VALUE str)
3760 {
3761     return DOUBLE2NUM(rb_str_to_dbl(str, Qfalse));
3762 }
3763
3764
3765 /*
3766  *  call-seq:
3767  *     str.to_s     => str
3768  *     str.to_str   => str
3769  *
3770  *  Returns the receiver.
3771  */
3772
3773 static VALUE
3774 rb_str_to_s(VALUE str)
3775 {
3776     if (rb_obj_class(str) != rb_cString) {
3777         VALUE dup = str_alloc(rb_cString);
3778         rb_str_replace(dup, str);
3779         return dup;
3780     }
3781     return str;
3782 }
3783
3784 static void
3785 str_cat_char(VALUE str, int c, rb_encoding *enc)
3786 {
3787     char s[16];
3788     int n = rb_enc_codelen(c, enc);
3789
3790     rb_enc_mbcput(c, s, enc);
3791     rb_enc_str_buf_cat(str, s, n, enc);
3792 }
3793
3794 static void
3795 prefix_escape(VALUE str, int c, rb_encoding *enc)
3796 {
3797     str_cat_char(str, '\\', enc);
3798     str_cat_char(str, c, enc);
3799 }
3800
3801 /*
3802  * call-seq:
3803  *   str.inspect   => string
3804  *
3805  * Returns a printable version of _str_, surrounded by quote marks,
3806  * with special characters escaped.
3807  *
3808  *    str = "hello"
3809  *    str[3] = "\b"
3810  *    str.inspect       #=> "\"hel\\bo\""
3811  */
3812
3813 VALUE
3814 rb_str_inspect(VALUE str)
3815 {
3816     rb_encoding *enc = STR_ENC_GET(str);
3817     char *p, *pend;
3818     VALUE result = rb_str_buf_new(0);
3819
3820     if (!rb_enc_asciicompat(enc)) enc = rb_usascii_encoding();
3821     rb_enc_associate(result, enc);
3822     str_cat_char(result, '"', enc);
3823     p = RSTRING_PTR(str); pend = RSTRING_END(str);
3824     while (p < pend) {
3825         int c;
3826         int n;
3827         int cc;
3828
3829         n = rb_enc_precise_mbclen(p, pend, enc);
3830         if (!MBCLEN_CHARFOUND_P(n)) {
3831             p++;
3832             n = 1;
3833             goto escape_codepoint;
3834         }
3835         n = MBCLEN_CHARFOUND_LEN(n);
3836
3837         c = rb_enc_codepoint(p, pend, enc);
3838         n = rb_enc_codelen(c, enc);
3839
3840         p += n;
3841         if (c == '"'|| c == '\\' ||
3842             (c == '#' &&
3843              p < pend &&
3844              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
3845              (cc = rb_enc_codepoint(p,pend,enc),
3846               (cc == '$' || cc == '@' || cc == '{')))) {
3847             prefix_escape(result, c, enc);
3848         }
3849         else if (c == '\n') {
3850             prefix_escape(result, 'n', enc);
3851         }
3852         else if (c == '\r') {
3853             prefix_escape(result, 'r', enc);
3854         }
3855         else if (c == '\t') {
3856             prefix_escape(result, 't', enc);
3857         }
3858         else if (c == '\f') {
3859             prefix_escape(result, 'f', enc);
3860         }
3861         else if (c == '\013') {
3862             prefix_escape(result, 'v', enc);
3863         }
3864         else if (c == '\010') {
3865             prefix_escape(result, 'b', enc);
3866         }
3867         else if (c == '\007') {
3868             prefix_escape(result, 'a', enc);
3869         }
3870         else if (c == 033) {
3871             prefix_escape(result, 'e', enc);
3872         }
3873         else if (rb_enc_isprint(c, enc)) {
3874             rb_enc_str_buf_cat(result, p-n, n, enc);
3875         }
3876         else {
3877             char buf[5];
3878             char *s;
3879             char *q;
3880
3881           escape_codepoint:
3882             for (q = p-n; q < p; q++) {
3883                 s = buf;
3884                 sprintf(buf, "\\x%02X", *q & 0377);
3885                 while (*s) {
3886                     str_cat_char(result, *s++, enc);
3887                 }
3888             }
3889         }
3890     }
3891     str_cat_char(result, '"', enc);
3892
3893     OBJ_INFECT(result, str);
3894     return result;
3895 }
3896
/*
 * True when p is still before e and points at '$', '@' or '{' -- the
 * characters that make a preceding '#' start an interpolation.
 */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
3898
3899 /*
3900  *  call-seq:
3901  *     str.dump   => new_str
3902  *
3903  *  Produces a version of <i>str</i> with all nonprinting characters replaced by
3904  *  <code>\xNN</code> notation and all special characters escaped.
3905  */
3906
3907 VALUE
3908 rb_str_dump(VALUE str)
3909 {
3910     rb_encoding *enc0 = rb_enc_get(str);
3911     long len;
3912     const char *p, *pend;
3913     char *q, *qend;
3914     VALUE result;
3915
3916     len = 2;                    /* "" */
3917     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3918     while (p < pend) {
3919         unsigned char c = *p++;
3920         switch (c) {
3921           case '"':  case '\\':
3922           case '\n': case '\r':
3923           case '\t': case '\f':
3924           case '\013': case '\010': case '\007': case '\033':
3925             len += 2;
3926             break;
3927
3928           case '#':
3929             len += IS_EVSTR(p, pend) ? 2 : 1;
3930             break;
3931
3932           default:
3933             if (ISPRINT(c)) {
3934                 len++;
3935             }
3936             else {
3937                 len += 4;               /* \xNN */
3938             }
3939             break;
3940         }
3941     }
3942     if (!rb_enc_asciicompat(enc0)) {
3943         len += 19;              /* ".force_encoding('')" */
3944         len += strlen(enc0->name);
3945     }
3946
3947     result = rb_str_new5(str, 0, len);
3948     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3949     q = RSTRING_PTR(result); qend = q + len;
3950
3951     *q++ = '"';
3952     while (p < pend) {
3953         unsigned char c = *p++;
3954
3955         if (c == '"' || c == '\\') {
3956             *q++ = '\\';
3957             *q++ = c;
3958         }
3959         else if (c == '#') {
3960             if (IS_EVSTR(p, pend)) *q++ = '\\';
3961             *q++ = '#';
3962         }
3963         else if (c == '\n') {
3964             *q++ = '\\';
3965             *q++ = 'n';
3966         }
3967         else if (c == '\r') {
3968             *q++ = '\\';
3969             *q++ = 'r';
3970         }
3971         else if (c == '\t') {
3972             *q++ = '\\';
3973             *q++ = 't';
3974         }
3975         else if (c == '\f') {
3976             *q++ = '\\';
3977             *q++ = 'f';
3978         }
3979         else if (c == '\013') {
3980             *q++ = '\\';
3981             *q++ = 'v';
3982         }
3983         else if (c == '\010') {
3984             *q++ = '\\';
3985             *q++ = 'b';
3986         }
3987         else if (c == '\007') {
3988             *q++ = '\\';
3989             *q++ = 'a';
3990         }
3991         else if (c == '\033') {
3992             *q++ = '\\';
3993             *q++ = 'e';
3994         }
3995         else if (ISPRINT(c)) {
3996             *q++ = c;
3997         }
3998         else {
3999             *q++ = '\\';
4000             sprintf(q, "x%02X", c);
4001             q += 3;
4002         }
4003     }
4004     *q++ = '"';
4005     if (!rb_enc_asciicompat(enc0)) {
4006         sprintf(q, ".force_encoding(\"%s\")", enc0->name);
4007         enc0 = rb_ascii8bit_encoding();
4008     }
4009
4010     OBJ_INFECT(result, str);
4011     /* result from dump is ASCII */
4012     rb_enc_associate(result, enc0);
4013     return result;
4014 }
4015
4016
4017 /*
4018  *  call-seq:
4019  *     str.upcase!   => str or nil
4020  *
4021  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
4022  *  were made.
4023  *  Note: case replacement is effective only in ASCII region.
4024  */
4025
4026 static VALUE
4027 rb_str_upcase_bang(VALUE str)
4028 {
4029     rb_encoding *enc;
4030     char *s, *send;
4031     int modify = 0;
4032     int cr = ENC_CODERANGE(str);
4033
4034     rb_str_modify(str);
4035     enc = STR_ENC_GET(str);
4036     s = RSTRING_PTR(str); send = RSTRING_END(str);
4037     while (s < send) {
4038         int c = rb_enc_codepoint(s, send, enc);
4039
4040         if (rb_enc_islower(c, enc)) {
4041             /* assuming toupper returns codepoint with same size */
4042             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4043             modify = 1;
4044         }
4045         s += rb_enc_codelen(c, enc);
4046     }
4047
4048     ENC_CODERANGE_SET(str, cr);
4049     if (modify) return str;
4050     return Qnil;
4051 }
4052
4053
4054 /*
4055  *  call-seq:
4056  *     str.upcase   => new_str
4057  *
4058  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
4059  *  uppercase counterparts. The operation is locale insensitive---only
4060  *  characters ``a'' to ``z'' are affected.
4061  *  Note: case replacement is effective only in ASCII region.
4062  *
4063  *     "hEllO".upcase   #=> "HELLO"
4064  */
4065
4066 static VALUE
4067 rb_str_upcase(VALUE str)
4068 {
4069     str = rb_str_dup(str);
4070     rb_str_upcase_bang(str);
4071     return str;
4072 }
4073
4074
4075 /*
4076  *  call-seq:
4077  *     str.downcase!   => str or nil
4078  *
4079  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4080  *  changes were made.
4081  *  Note: case replacement is effective only in ASCII region.
4082  */
4083
4084 static VALUE
4085 rb_str_downcase_bang(VALUE str)
4086 {
4087     rb_encoding *enc;
4088     char *s, *send;
4089     int modify = 0;
4090     int cr = ENC_CODERANGE(str);
4091
4092     rb_str_modify(str);
4093     enc = STR_ENC_GET(str);
4094     s = RSTRING_PTR(str); send = RSTRING_END(str);
4095     while (s < send) {
4096         int c = rb_enc_codepoint(s, send, enc);
4097
4098         if (rb_enc_isupper(c, enc)) {
4099             /* assuming toupper returns codepoint with same size */
4100             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4101             modify = 1;
4102         }
4103         s += rb_enc_codelen(c, enc);
4104     }
4105
4106     ENC_CODERANGE_SET(str, cr);
4107     if (modify) return str;
4108     return Qnil;
4109 }
4110
4111
4112 /*
4113  *  call-seq:
4114  *     str.downcase   => new_str
4115  *
4116  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
4117  *  lowercase counterparts. The operation is locale insensitive---only
4118  *  characters ``A'' to ``Z'' are affected.
4119  *  Note: case replacement is effective only in ASCII region.
4120  *
4121  *     "hEllO".downcase   #=> "hello"
4122  */
4123
4124 static VALUE
4125 rb_str_downcase(VALUE str)
4126 {
4127     str = rb_str_dup(str);
4128     rb_str_downcase_bang(str);
4129     return str;
4130 }
4131
4132
4133 /*
4134  *  call-seq:
4135  *     str.capitalize!   => str or nil
4136  *
4137  *  Modifies <i>str</i> by converting the first character to uppercase and the
4138  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
4139  *  Note: case conversion is effective only in ASCII region.
4140  *
4141  *     a = "hello"
4142  *     a.capitalize!   #=> "Hello"
4143  *     a               #=> "Hello"
4144  *     a.capitalize!   #=> nil
4145  */
4146
4147 static VALUE
4148 rb_str_capitalize_bang(VALUE str)
4149 {
4150     rb_encoding *enc;
4151     char *s, *send;
4152     int modify = 0;
4153     int c;
4154     int cr = ENC_CODERANGE(str);
4155
4156     rb_str_modify(str);
4157     enc = STR_ENC_GET(str);
4158     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4159     s = RSTRING_PTR(str); send = RSTRING_END(str);
4160
4161     c = rb_enc_codepoint(s, send, enc);
4162     if (rb_enc_islower(c, enc)) {
4163         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4164         modify = 1;
4165     }
4166     s += rb_enc_codelen(c, enc);
4167     while (s < send) {
4168         c = rb_enc_codepoint(s, send, enc);
4169         if (rb_enc_isupper(c, enc)) {
4170             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4171             modify = 1;
4172         }
4173         s += rb_enc_codelen(c, enc);
4174     }
4175
4176     ENC_CODERANGE_SET(str, cr);
4177     if (modify) return str;
4178     return Qnil;
4179 }
4180
4181
4182 /*
4183  *  call-seq:
4184  *     str.capitalize   => new_str
4185  *
4186  *  Returns a copy of <i>str</i> with the first character converted to uppercase
4187  *  and the remainder to lowercase.
4188  *  Note: case conversion is effective only in ASCII region.
4189  *
4190  *     "hello".capitalize    #=> "Hello"
4191  *     "HELLO".capitalize    #=> "Hello"
4192  *     "123ABC".capitalize   #=> "123abc"
4193  */
4194
4195 static VALUE
4196 rb_str_capitalize(VALUE str)
4197 {
4198     str = rb_str_dup(str);
4199     rb_str_capitalize_bang(str);
4200     return str;
4201 }
4202
4203
4204 /*
4205  *  call-seq:
 *     str.swapcase!   => str or nil
4207  *
4208  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
4209  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
4210  *  Note: case conversion is effective only in ASCII region.
4211  */
4212
4213 static VALUE
4214 rb_str_swapcase_bang(VALUE str)
4215 {
4216     rb_encoding *enc;
4217     char *s, *send;
4218     int modify = 0;
4219     int cr = ENC_CODERANGE(str);
4220
4221     rb_str_modify(str);
4222     enc = STR_ENC_GET(str);
4223     s = RSTRING_PTR(str); send = RSTRING_END(str);
4224     while (s < send) {
4225         int c = rb_enc_codepoint(s, send, enc);
4226
4227         if (rb_enc_isupper(c, enc)) {
4228             /* assuming toupper returns codepoint with same size */
4229             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4230             modify = 1;
4231         }
4232         else if (rb_enc_islower(c, enc)) {
4233             /* assuming toupper returns codepoint with same size */
4234             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4235             modify = 1;
4236         }
4237         s += rb_enc_codelen(c, enc);
4238     }
4239
4240     ENC_CODERANGE_SET(str, cr);
4241     if (modify) return str;
4242     return Qnil;
4243 }
4244
4245
4246 /*
4247  *  call-seq:
4248  *     str.swapcase   => new_str
4249  *
4250  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
4251  *  to lowercase and lowercase characters converted to uppercase.
4252  *  Note: case conversion is effective only in ASCII region.
4253  *
4254  *     "Hello".swapcase          #=> "hELLO"
4255  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
4256  */
4257
4258 static VALUE
4259 rb_str_swapcase(VALUE str)
4260 {
4261     str = rb_str_dup(str);
4262     rb_str_swapcase_bang(str);
4263     return str;
4264 }
4265
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification, consumed by trnext(). */
struct tr {
    int gen;     /* non-zero while a c1-c2 range is being expanded */
    int now;     /* codepoint most recently produced */
    int max;     /* last (inclusive) codepoint of the range being expanded */
    char *p;     /* next unread byte of the specification */
    char *pend;  /* one past the final byte of the specification */
};
4272
4273 static int
4274 trnext(struct tr *t, rb_encoding *enc)
4275 {
4276     for (;;) {
4277         if (!t->gen) {
4278             if (t->p == t->pend) return -1;
4279             if (t->p < t->pend - 1 && *t->p == '\\') {
4280                 t->p++;
4281             }
4282             t->now = rb_enc_codepoint(t->p, t->pend, enc);
4283             t->p += rb_enc_codelen(t->now, enc);
4284             if (t->p < t->pend - 1 && *t->p == '-') {
4285                 t->p++;
4286                 if (t->p < t->pend) {
4287                     int c = rb_enc_codepoint(t->p, t->pend, enc);
4288                     t->p += rb_enc_codelen(c, enc);
4289                     if (t->now > c) continue;
4290                     t->gen = 1;
4291                     t->max = c;
4292                 }
4293             }
4294             return t->now;
4295         }
4296         else if (++t->now < t->max) {
4297             return t->now;
4298         }
4299         else {
4300             t->gen = 0;
4301             return t->max;
4302         }
4303     }
4304 }
4305
4306 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
4307
4308 static VALUE
4309 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
4310 {
4311     int trans[256];
4312     rb_encoding *enc, *e1, *e2;
4313     struct tr trsrc, trrepl;
4314     int cflag = 0;
4315     int c, c0, last = 0, modify = 0, i, l;
4316     char *s, *send;
4317     VALUE hash = 0;
4318     int singlebyte = single_byte_optimizable(str);
4319
4320     StringValue(src);
4321     StringValue(repl);
4322     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4323     if (RSTRING_LEN(repl) == 0) {
4324         return rb_str_delete_bang(1, &src, str);
4325     }
4326
4327     e1 = rb_enc_check(str, src);
4328     e2 = rb_enc_check(str, repl);
4329     if (e1 == e2) {
4330         enc = e1;
4331     }
4332     else {
4333         enc = rb_enc_check(src, repl);
4334     }
4335     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
4336     if (RSTRING_LEN(src) > 1 &&
4337         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
4338         trsrc.p + l < trsrc.pend) {
4339         cflag = 1;
4340         trsrc.p += l;
4341     }
4342     trrepl.p = RSTRING_PTR(repl);
4343     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
4344     trsrc.gen = trrepl.gen = 0;
4345     trsrc.now = trrepl.now = 0;
4346     trsrc.max = trrepl.max = 0;
4347
4348     if (cflag) {
4349         for (i=0; i<256; i++) {
4350             trans[i] = 1;
4351         }
4352         while ((c = trnext(&trsrc, enc)) >= 0) {
4353             if (c < 256) {
4354                 trans[c] = -1;
4355             }
4356             else {
4357                 if (!hash) hash = rb_hash_new();
4358                 rb_hash_aset(hash, INT2NUM(c), Qtrue);
4359             }
4360         }
4361         while ((c = trnext(&trrepl, enc)) >= 0)
4362             /* retrieve last replacer */;
4363         last = trrepl.now;
4364         for (i=0; i<256; i++) {
4365             if (trans[i] >= 0) {
4366                 trans[i] = last;
4367             }
4368         }
4369     }
4370     else {
4371         int r;
4372
4373         for (i=0; i<256; i++) {
4374             trans[i] = -1;
4375         }
4376         while ((c = trnext(&trsrc, enc)) >= 0) {
4377             r = trnext(&trrepl, enc);
4378             if (r == -1) r = trrepl.now;
4379             if (c < 256) {
4380                 trans[c] = r;
4381                 if (r > 255) singlebyte = 0;
4382             }
4383             else {
4384                 if (!hash) hash = rb_hash_new();
4385                 rb_hash_aset(hash, INT2NUM(c), INT2NUM(r));
4386             }
4387         }
4388     }
4389
4390     rb_str_modify(str);
4391     s = RSTRING_PTR(str); send = RSTRING_END(str);
4392     if (sflag) {
4393         int clen, tlen, max = RSTRING_LEN(str);
4394         int offset, save = -1;
4395         char *buf = ALLOC_N(char, max), *t = buf;
4396
4397         while (s < send) {
4398             c0 = c = rb_enc_codepoint(s, send, enc);
4399             tlen = clen = rb_enc_codelen(c, enc);
4400
4401             s += clen;
4402             if (c < 256) {
4403                 c = trans[c];
4404             }
4405             else if (hash) {
4406                 VALUE tmp = rb_hash_lookup(hash, INT2NUM(c));
4407                 if (NIL_P(tmp)) {
4408                     if (cflag) c = last;
4409                     else c = -1;
4410                 }
4411                 else if (cflag) c = -1;
4412                 else c = NUM2INT(tmp);
4413             }
4414             else {
4415                 c = -1;
4416             }
4417             if (c >= 0) {
4418                 if (save == c) continue;
4419                 save = c;
4420                 tlen = rb_enc_codelen(c, enc);
4421                 modify = 1;
4422             }
4423             else {
4424                 save = -1;
4425                 c = c0;
4426             }
4427             while (t - buf + tlen >= max) {
4428                 offset = t - buf;
4429                 max *= 2;
4430                 REALLOC_N(buf, char, max);
4431                 t = buf + offset;
4432             }
4433             rb_enc_mbcput(c, t, enc);
4434             t += tlen;
4435         }
4436         *t = '\0';
4437         RSTRING(str)->as.heap.ptr = buf;
4438         RSTRING(str)->as.heap.len = t - buf;
4439         STR_SET_NOEMBED(str);
4440         RSTRING(str)->as.heap.aux.capa = max;
4441     }
4442     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
4443         while (s < send) {
4444             c = (unsigned char)*s;
4445             if (trans[c] >= 0) {
4446                 if (!cflag) {
4447                     c = trans[c];
4448                     *s = c;
4449                     modify = 1;
4450                 }
4451                 else {
4452                     *s = last;
4453                     modify = 1;
4454                 }
4455             }
4456             s++;
4457         }
4458     }
4459     else {
4460         int clen, tlen, max = RSTRING_LEN(str) * 1.2;
4461         int offset;
4462         char *buf = ALLOC_N(char, max), *t = buf;
4463
4464         while (s < send) {
4465             c0 = c = rb_enc_codepoint(s, send, enc);
4466             tlen = clen = rb_enc_codelen(c, enc);
4467
4468             if (c < 256) {
4469                 c = trans[c];
4470             }
4471             else if (hash) {
4472                 VALUE tmp = rb_hash_lookup(hash, INT2NUM(c));
4473                 if (NIL_P(tmp)) {
4474                     if (cflag) c = last;
4475                     else c = -1;
4476                 }
4477                 else if (cflag) c = -1;
4478                 else c = NUM2INT(tmp);
4479             }
4480             else {
4481                 c = -1;
4482             }
4483             if (c >= 0) {
4484                 tlen = rb_enc_codelen(c, enc);
4485                 modify = 1;
4486             }
4487             else {
4488                 modify = 1;
4489                 c = c0;
4490             }
4491             while (t - buf + tlen >= max) {
4492                 offset = t - buf;
4493                 max *= 2;
4494                 REALLOC_N(buf, char, max);
4495                 t = buf + offset;
4496             }
4497             if (s != t) rb_enc_mbcput(c, t, enc);
4498             s += clen;
4499             t += tlen;
4500         }
4501         if (!STR_EMBED_P(str)) {
4502             xfree(RSTRING(str)->as.heap.ptr);
4503         }
4504         *t = '\0';
4505         RSTRING(str)->as.heap.ptr = buf;
4506         RSTRING(str)->as.heap.len = t - buf;
4507         STR_SET_NOEMBED(str);
4508         RSTRING(str)->as.heap.aux.capa = max;
4509     }
4510
4511     if (modify) {
4512         rb_enc_associate(str, enc);
4513         return str;
4514     }
4515     return Qnil;
4516 }
4517
4518
4519 /*
4520  *  call-seq:
4521  *     str.tr!(from_str, to_str)   => str or nil
4522  *
4523  *  Translates <i>str</i> in place, using the same rules as
4524  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
4525  *  changes were made.
4526  */
4527
4528 static VALUE
4529 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
4530 {
4531     return tr_trans(str, src, repl, 0);
4532 }
4533
4534
4535 /*
4536  *  call-seq:
4537  *     str.tr(from_str, to_str)   => new_str
4538  *
4539  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i> replaced
4540  *  by the corresponding characters in <i>to_str</i>. If <i>to_str</i> is
4541  *  shorter than <i>from_str</i>, it is padded with its last character. Both
4542  *  strings may use the c1--c2 notation to denote ranges of characters, and
4543  *  <i>from_str</i> may start with a <code>^</code>, which denotes all
4544  *  characters except those listed.
4545  *
4546  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
4547  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
4548  *     "hello".tr('el', 'ip')      #=> "hippo"
4549  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
4550  */
4551
4552 static VALUE
4553 rb_str_tr(VALUE str, VALUE src, VALUE repl)
4554 {
4555     str = rb_str_dup(str);
4556     tr_trans(str, src, repl, 0);
4557     return str;
4558 }
4559
4560 static void
4561 tr_setup_table(VALUE str, char stable[256], int first,
4562                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
4563 {
4564     char buf[256];
4565     struct tr tr;
4566     int c, l;
4567     VALUE table = 0, ptable = 0;
4568     int i, cflag = 0;
4569
4570     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
4571     tr.gen = tr.now = tr.max = 0;
4572
4573     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
4574         cflag = 1;
4575         tr.p += l;
4576     }
4577     if (first) {
4578         for (i=0; i<256; i++) {
4579             stable[i] = 1;
4580         }
4581     }
4582     for (i=0; i<256; i++) {
4583         buf[i] = cflag;
4584     }
4585
4586     while ((c = trnext(&tr, enc)) >= 0) {
4587         if (c < 256) {
4588             buf[c & 0xff] = !cflag;
4589         }
4590         else {
4591             VALUE key = INT2NUM(c);
4592
4593             if (!table) {
4594                 table = rb_hash_new();
4595                 if (cflag) {
4596                     ptable = *ctablep;
4597                     *ctablep = table;
4598                 }
4599                 else {
4600                     ptable = *tablep;
4601                     *tablep = table;
4602                 }
4603             }
4604             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
4605                 rb_hash_aset(table, key, Qtrue);
4606             }
4607         }
4608     }
4609     for (i=0; i<256; i++) {
4610         stable[i] = stable[i] && buf[i];
4611     }
4612 }
4613
4614
4615 static int
4616 tr_find(int c, char table[256], VALUE del, VALUE nodel)
4617 {
4618     if (c < 256) {
4619         return table[c] ? Qtrue : Qfalse;
4620     }
4621     else {
4622         VALUE v = INT2NUM(c);
4623
4624         if (del && !NIL_P(rb_hash_lookup(del, v))) {
4625             if (!nodel || NIL_P(rb_hash_lookup(nodel, v))) {
4626                 return Qtrue;
4627             }
4628         }
4629         return Qfalse;
4630     }
4631 }
4632
4633 /*
4634  *  call-seq:
4635  *     str.delete!([other_str]+)   => str or nil
4636  *
4637  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
4638  *  <code>nil</code> if <i>str</i> was not modified.
4639  */
4640
4641 static VALUE
4642 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
4643 {
4644     char squeez[256];
4645     rb_encoding *enc = 0;
4646     char *s, *send, *t;
4647     VALUE del = 0, nodel = 0;
4648     int modify = 0;
4649     int i;
4650     int cr;
4651
4652     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4653     cr = ENC_CODERANGE(str);
4654     if (argc < 1) {
4655         rb_raise(rb_eArgError, "wrong number of arguments");
4656     }
4657     for (i=0; i<argc; i++) {
4658         VALUE s = argv[i];
4659
4660         StringValue(s);
4661         enc = rb_enc_check(str, s);
4662         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
4663     }
4664
4665     rb_str_modify(str);
4666     s = t = RSTRING_PTR(str);
4667     if (!s || RSTRING_LEN(str) == 0) return Qnil;
4668     send = RSTRING_END(str);
4669     while (s < send) {
4670         int c = rb_enc_codepoint(s, send, enc);
4671         int clen = rb_enc_codelen(c, enc);
4672
4673         if (tr_find(c, squeez, del, nodel)) {
4674             modify = 1;
4675         }
4676         else {
4677             if (t != s) rb_enc_mbcput(c, t, enc);
4678             t += clen;
4679         }
4680         s += clen;
4681     }
4682     *t = '\0';
4683     STR_SET_LEN(str, t - RSTRING_PTR(str));
4684
4685     ENC_CODERANGE_SET(str, cr);
4686     if (modify) return str;
4687     return Qnil;
4688 }
4689
4690
4691 /*
4692  *  call-seq:
4693  *     str.delete([other_str]+)   => new_str
4694  *
4695  *  Returns a copy of <i>str</i> with all characters in the intersection of its
4696  *  arguments deleted. Uses the same rules for building the set of characters as
4697  *  <code>String#count</code>.
4698  *
4699  *     "hello".delete "l","lo"        #=> "heo"
4700  *     "hello".delete "lo"            #=> "he"
4701  *     "hello".delete "aeiou", "^e"   #=> "hell"
4702  *     "hello".delete "ej-m"          #=> "ho"
4703  */
4704
4705 static VALUE
4706 rb_str_delete(int argc, VALUE *argv, VALUE str)
4707 {
4708     str = rb_str_dup(str);
4709     rb_str_delete_bang(argc, argv, str);
4710     return str;
4711 }
4712
4713
4714 /*
4715  *  call-seq:
4716  *     str.squeeze!([other_str]*)   => str or nil
4717  *
4718  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
4719  *  <code>nil</code> if no changes were made.
4720  */
4721
4722 static VALUE
4723 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
4724 {
4725     char squeez[256];
4726     rb_encoding *enc = 0;
4727     VALUE del = 0, nodel = 0;
4728     char *s, *send, *t;
4729     int save, modify = 0;
4730     int i;
4731
4732     if (argc == 0) {
4733         enc = STR_ENC_GET(str);
4734     }
4735     else {
4736         for (i=0; i<argc; i++) {
4737             VALUE s = argv[i];
4738
4739             StringValue(s);
4740             enc = rb_enc_check(str, s);
4741             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
4742         }
4743     }
4744
4745     rb_str_modify(str);
4746     s = t = RSTRING_PTR(str);
4747     if (!s || RSTRING_LEN(str) == 0) return Qnil;
4748     send = RSTRING_END(str);
4749     save = -1;
4750     while (s < send) {
4751         int c = rb_enc_codepoint(s, send, enc);
4752         int clen = rb_enc_codelen(c, enc);
4753
4754         if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
4755             if (t != s) rb_enc_mbcput(c, t, enc);
4756             save = c;
4757             t += clen;
4758         }
4759         s += clen;
4760     }
4761     *t = '\0';
4762     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
4763         STR_SET_LEN(str, t - RSTRING_PTR(str));
4764         modify = 1;
4765     }
4766
4767     if (modify) return str;
4768     return Qnil;
4769 }
4770
4771
4772 /*
4773  *  call-seq:
4774  *     str.squeeze([other_str]*)    => new_str
4775  *
4776  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
4777  *  procedure described for <code>String#count</code>. Returns a new string
4778  *  where runs of the same character that occur in this set are replaced by a
4779  *  single character. If no arguments are given, all runs of identical
4780  *  characters are replaced by a single character.
4781  *
4782  *     "yellow moon".squeeze                  #=> "yelow mon"
4783  *     "  now   is  the".squeeze(" ")         #=> " now is the"
4784  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
4785  */
4786
4787 static VALUE
4788 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
4789 {
4790     str = rb_str_dup(str);
4791     rb_str_squeeze_bang(argc, argv, str);
4792     return str;
4793 }
4794
4795
4796 /*
4797  *  call-seq:
4798  *     str.tr_s!(from_str, to_str)   => str or nil
4799  *
4800  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
4801  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
4802  */
4803
4804 static VALUE
4805 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
4806 {
4807     return tr_trans(str, src, repl, 1);
4808 }
4809
4810
4811 /*
4812  *  call-seq:
4813  *     str.tr_s(from_str, to_str)   => new_str
4814  *
4815  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
4816  *  then removes duplicate characters in regions that were affected by the
4817  *  translation.
4818  *
4819  *     "hello".tr_s('l', 'r')     #=> "hero"
4820  *     "hello".tr_s('el', '*')    #=> "h*o"
4821  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
4822  */
4823
4824 static VALUE
4825 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
4826 {
4827     str = rb_str_dup(str);
4828     tr_trans(str, src, repl, 1);
4829     return str;
4830 }
4831
4832
4833 /*
4834  *  call-seq:
4835  *     str.count([other_str]+)   => fixnum
4836  *
4837  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
4838  *  intersection of these sets defines the characters to count in
4839  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
4840  *  negated. The sequence c1--c2 means all characters between c1 and c2.
4841  *
4842  *     a = "hello world"
4843  *     a.count "lo"            #=> 5
4844  *     a.count "lo", "o"       #=> 2
4845  *     a.count "hello", "^l"   #=> 4
4846  *     a.count "ej-m"          #=> 4
4847  */
4848
4849 static VALUE
4850 rb_str_count(int argc, VALUE *argv, VALUE str)
4851 {
4852     char table[256];
4853     rb_encoding *enc = 0;
4854     VALUE del = 0, nodel = 0;
4855     char *s, *send;
4856     int i;
4857
4858     if (argc < 1) {
4859         rb_raise(rb_eArgError, "wrong number of arguments");
4860     }
4861     for (i=0; i<argc; i++) {
4862         VALUE s = argv[i];
4863
4864         StringValue(s);
4865         enc = rb_enc_check(str, s);
4866         tr_setup_table(s, table,i==0, &del, &nodel, enc);
4867     }
4868
4869     s = RSTRING_PTR(str);
4870     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
4871     send = RSTRING_END(str);
4872     i = 0;
4873     while (s < send) {
4874         int c = rb_enc_codepoint(s, send, enc);
4875         int clen = rb_enc_codelen(c, enc);
4876
4877         if (tr_find(c, table, del, nodel)) {
4878             i++;
4879         }
4880         s += clen;
4881     }
4882     return INT2NUM(i);
4883 }
4884
4885
4886 /*
4887  *  call-seq:
4888  *     str.split(pattern=$;, [limit])   => anArray
4889  *
4890  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
4891  *  of these substrings.
4892  *
4893  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
4894  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
4895  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
4896  *  of contiguous whitespace characters ignored.
4897  *
4898  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
4899  *  pattern matches. Whenever the pattern matches a zero-length string,
4900  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
4901  *  groups, the respective matches will be returned in the array as well.
4902  *
4903  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
4904  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
4905  *  split on whitespace as if ` ' were specified.
4906  *
4907  *  If the <i>limit</i> parameter is omitted, trailing null fields are
4908  *  suppressed. If <i>limit</i> is a positive number, at most that number of
4909  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
4910  *  string is returned as the only entry in an array). If negative, there is no
4911  *  limit to the number of fields returned, and trailing null fields are not
4912  *  suppressed.
4913  *
4914  *     " now's  the time".split        #=> ["now's", "the", "time"]
4915  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
4916  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
4917  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
4918  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
4919  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
4920  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
4921  *
4922  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
4923  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
4924  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
4925  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
4926  */
4927
4928 static VALUE
4929 rb_str_split_m(int argc, VALUE *argv, VALUE str)
4930 {
4931     rb_encoding *enc;
4932     VALUE spat;
4933     VALUE limit;
4934     int awk_split = Qfalse;
4935     long beg, end, i = 0;
4936     int lim = 0;
4937     VALUE result, tmp;
4938
4939     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
4940         lim = NUM2INT(limit);
4941         if (lim <= 0) limit = Qnil;
4942         else if (lim == 1) {
4943             if (RSTRING_LEN(str) == 0)
4944                 return rb_ary_new2(0);
4945             return rb_ary_new3(1, str);
4946         }
4947         i = 1;
4948     }
4949
4950     enc = STR_ENC_GET(str);
4951     if (NIL_P(spat)) {
4952         if (!NIL_P(rb_fs)) {
4953             spat = rb_fs;
4954             goto fs_set;
4955         }
4956         awk_split = Qtrue;
4957     }
4958     else {
4959       fs_set:
4960         if (TYPE(spat) == T_STRING) {
4961             rb_encoding *enc2 = STR_ENC_GET(spat);
4962
4963             if (rb_enc_mbminlen(enc2) == 1) {
4964                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
4965                     awk_split = Qtrue;
4966                 }
4967             }
4968             else {
4969                 int l;
4970                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
4971                     RSTRING_LEN(spat) == l) {
4972                     awk_split = Qtrue;
4973                 }
4974             }
4975             if (!awk_split) {
4976                 spat = rb_reg_regcomp(rb_reg_quote(spat));
4977             }
4978         }
4979         else {
4980             spat = get_pat(spat, 1);
4981         }
4982     }
4983
4984     result = rb_ary_new();
4985     beg = 0;
4986     if (awk_split) {
4987         char *ptr = RSTRING_PTR(str);
4988         char *eptr = RSTRING_END(str);
4989         char *bptr = ptr;
4990         int skip = 1;
4991         int c;
4992
4993         end = beg;
4994         while (ptr < eptr) {
4995             c = rb_enc_codepoint(ptr, eptr, enc);
4996             ptr += rb_enc_mbclen(ptr, eptr, enc);
4997             if (skip) {
4998                 if (rb_enc_isspace(c, enc)) {
4999                     beg = ptr - bptr;
5000                 }
5001                 else {
5002                     end = ptr - bptr;
5003                     skip = 0;
5004                     if (!NIL_P(limit) && lim <= i) break;
5005                 }
5006             }
5007             else {
5008                 if (rb_enc_isspace(c, enc)) {
5009                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5010                     skip = 1;
5011                     beg = ptr - bptr;
5012                     if (!NIL_P(limit)) ++i;
5013                 }
5014                 else {
5015                     end = ptr - bptr;
5016                 }
5017             }
5018         }
5019     }
5020     else {
5021         long start = beg;
5022         long idx;
5023         int last_null = 0;
5024         struct re_registers *regs;
5025
5026         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
5027             regs = RMATCH_REGS(rb_backref_get());
5028             if (start == end && BEG(0) == END(0)) {
5029                 if (!RSTRING_PTR(str)) {
5030                     rb_ary_push(result, rb_str_new("", 0));
5031                     break;
5032                 }
5033                 else if (last_null == 1) {
5034                     rb_ary_push(result, rb_str_subseq(str, beg,
5035                                                       rb_enc_mbclen(RSTRING_PTR(str)+beg,
5036                                                                     RSTRING_END(str),
5037                                                                     enc)));
5038                     beg = start;
5039                 }
5040                 else {
5041                     if (RSTRING_PTR(str)+start == RSTRING_END(str))
5042                         start++;
5043                     else
5044                         start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
5045                     last_null = 1;
5046                     continue;
5047                 }
5048             }
5049             else {
5050                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
5051                 beg = start = END(0);
5052             }
5053             last_null = 0;
5054
5055             for (idx=1; idx < regs->num_regs; idx++) {
5056                 if (BEG(idx) == -1) continue;
5057                 if (BEG(idx) == END(idx))
5058                     tmp = rb_str_new5(str, 0, 0);
5059                 else
5060                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
5061                 rb_ary_push(result, tmp);
5062             }
5063             if (!NIL_P(limit) && lim <= ++i) break;
5064         }
5065     }
5066     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
5067         if (RSTRING_LEN(str) == beg)
5068             tmp = rb_str_new5(str, 0, 0);
5069         else
5070             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
5071         rb_ary_push(result, tmp);
5072     }
5073     if (NIL_P(limit) && lim == 0) {
5074         while (RARRAY_LEN(result) > 0 &&
5075                RSTRING_LEN(RARRAY_PTR(result)[RARRAY_LEN(result)-1]) == 0)
5076             rb_ary_pop(result);
5077     }
5078
5079     return result;
5080 }
5081
5082 VALUE
5083 rb_str_split(VALUE str, const char *sep0)
5084 {
5085     VALUE sep;
5086
5087     StringValue(str);
5088     sep = rb_str_new2(sep0);
5089     return rb_str_split_m(1, &sep, str);
5090 }
5091
5092
5093 /*
5094  *  Document-method: lines
5095  *  call-seq:
5096  *     str.lines(separator=$/)   => anEnumerator
5097  *     str.lines(separator=$/) {|substr| block }        => str
5098  *
5099  *  Returns an enumerator that gives each line in the string.  If a block is
5100  *  given, it iterates over each line in the string.
5101  *
5102  *     "foo\nbar\n".lines.to_a   #=> ["foo\n", "bar\n"]
5103  *     "foo\nb ar".lines.sort    #=> ["b ar", "foo\n"]
5104  */
5105
5106 /*
5107  *  Document-method: each_line
5108  *  call-seq:
5109  *     str.each_line(separator=$/) {|substr| block }   => str
5110  *
5111  *  Splits <i>str</i> using the supplied parameter as the record separator
5112  *  (<code>$/</code> by default), passing each substring in turn to the supplied
5113  *  block. If a zero-length record separator is supplied, the string is split
5114  *  into paragraphs delimited by multiple successive newlines.
5115  *
5116  *     print "Example one\n"
5117  *     "hello\nworld".each_line {|s| p s}
5118  *     print "Example two\n"
5119  *     "hello\nworld".each_line('l') {|s| p s}
5120  *     print "Example three\n"
5121  *     "hello\n\n\nworld".each_line('') {|s| p s}
5122  *
5123  *  <em>produces:</em>
5124  *
5125  *     Example one
5126  *     "hello\n"
5127  *     "world"
5128  *     Example two
5129  *     "hel"
5130  *     "l"
5131  *     "o\nworl"
5132  *     "d"
5133  *     Example three
5134  *     "hello\n\n\n"
5135  *     "world"
5136  */
5137
5138 static VALUE
5139 rb_str_each_line(int argc, VALUE *argv, VALUE str)
5140 {
5141     rb_encoding *enc;
5142     VALUE rs;
5143     int newline;
5144     char *p, *pend, *s, *ptr;
5145     long len, rslen;
5146     VALUE line;
5147     int n;
5148     VALUE orig = str;
5149
5150     if (argc == 0) {
5151         rs = rb_rs;
5152     }
5153     else {
5154         rb_scan_args(argc, argv, "01", &rs);
5155     }
5156     RETURN_ENUMERATOR(str, argc, argv);
5157     if (NIL_P(rs)) {
5158         rb_yield(str);
5159         return orig;
5160     }
5161     str = rb_str_new4(str);
5162     ptr = p = s = RSTRING_PTR(str);
5163     pend = p + RSTRING_LEN(str);
5164     len = RSTRING_LEN(str);
5165     StringValue(rs);
5166     if (rs == rb_default_rs) {
5167         enc = rb_enc_get(str);
5168         while (p < pend) {
5169             char *p0;
5170
5171             p = memchr(p, '\n', pend - p);
5172             if (!p) break;
5173             p0 = rb_enc_left_char_head(s, p, enc);
5174             if (!rb_enc_is_newline(p0, pend, enc)) {
5175                 p++;
5176                 continue;
5177             }
5178             p = p0 + rb_enc_mbclen(p0, pend, enc);
5179             line = rb_str_new5(str, s, p - s);
5180             OBJ_INFECT(line, str);
5181             rb_enc_cr_str_copy_for_substr(line, str);
5182             rb_yield(line);
5183             str_mod_check(str, ptr, len);
5184             s = p;
5185         }
5186         goto finish;
5187     }
5188
5189     enc = rb_enc_check(str, rs);
5190     rslen = RSTRING_LEN(rs);
5191     if (rslen == 0) {
5192         newline = '\n';
5193     }
5194     else {
5195         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
5196     }
5197
5198     while (p < pend) {
5199         int c = rb_enc_codepoint(p, pend, enc);
5200
5201       again:
5202         n = rb_enc_codelen(c, enc);
5203         if (rslen == 0 && c == newline) {
5204             p += n;
5205             if (p < pend && (c = rb_enc_codepoint(p, pend, enc)) != newline) {
5206                 goto again;
5207             }
5208             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
5209                 p += n;
5210             }
5211             p -= n;
5212         }
5213         if (c == newline &&
5214             (rslen <= 1 || memcmp(RSTRING_PTR(rs), p, rslen) == 0)) {
5215             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
5216             OBJ_INFECT(line, str);
5217             rb_enc_cr_str_copy_for_substr(line, str);
5218             rb_yield(line);
5219             str_mod_check(str, ptr, len);
5220             s = p + (rslen ? rslen : n);
5221         }
5222         p += n;
5223     }
5224
5225   finish:
5226     if (s != pend) {
5227         line = rb_str_new5(str, s, pend - s);
5228         OBJ_INFECT(line, str);
5229         rb_enc_cr_str_copy_for_substr(line, str);
5230         rb_yield(line);
5231     }
5232
5233     return orig;
5234 }
5235
5236
5237 /*
5238  *  Document-method: bytes
5239  *  call-seq:
5240  *     str.bytes   => anEnumerator
5241  *     str.bytes {|fixnum| block }    => str
5242  *
5243  *  Returns an enumerator that gives each byte in the string.  If a block is
5244  *  given, it iterates over each byte in the string.
5245  *
5246  *     "hello".bytes.to_a        #=> [104, 101, 108, 108, 111]
5247  */
5248
5249 /*
5250  *  Document-method: each_byte
5251  *  call-seq:
5252  *     str.each_byte {|fixnum| block }    => str
5253  *
5254  *  Passes each byte in <i>str</i> to the given block.
5255  *
5256  *     "hello".each_byte {|c| print c, ' ' }
5257  *
5258  *  <em>produces:</em>
5259  *
5260  *     104 101 108 108 111
5261  */
5262
5263 static VALUE
5264 rb_str_each_byte(VALUE str)
5265 {
5266     long i;
5267
5268     RETURN_ENUMERATOR(str, 0, 0);
5269     for (i=0; i<RSTRING_LEN(str); i++) {
5270         rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
5271     }
5272     return str;
5273 }
5274
5275
5276 /*
5277  *  Document-method: chars
5278  *  call-seq:
5279  *     str.chars                   => anEnumerator
5280  *     str.chars {|substr| block } => str
5281  *
5282  *  Returns an enumerator that gives each character in the string.
5283  *  If a block is given, it iterates over each character in the string.
5284  *
5285  *     "foo".chars.to_a   #=> ["f","o","o"]
5286  */
5287
5288 /*
5289  *  Document-method: each_char
5290  *  call-seq:
5291  *     str.each_char {|cstr| block }    => str
5292  *
5293  *  Passes each character in <i>str</i> to the given block.
5294  *
5295  *     "hello".each_char {|c| print c, ' ' }
5296  *
5297  *  <em>produces:</em>
5298  *
5299  *     h e l l o
5300  */
5301
5302 static VALUE
5303 rb_str_each_char(VALUE str)
5304 {
5305     int i, len, n;
5306     const char *ptr;
5307     rb_encoding *enc;
5308
5309     RETURN_ENUMERATOR(str, 0, 0);
5310     str = rb_str_new4(str);
5311     ptr = RSTRING_PTR(str);
5312     len = RSTRING_LEN(str);
5313     enc = rb_enc_get(str);
5314     for (i = 0; i < len; i += n) {
5315         n = rb_enc_mbclen(ptr + i, ptr + len, enc);
5316         rb_yield(rb_str_subseq(str, i, n));
5317     }
5318     return str;
5319 }
5320
5321 static long
5322 chopped_length(VALUE str)
5323 {
5324     rb_encoding *enc = STR_ENC_GET(str);
5325     const char *p, *p2, *beg, *end;
5326
5327     beg = RSTRING_PTR(str);
5328     end = beg + RSTRING_LEN(str);
5329     if (beg > end) return 0;
5330     p = rb_enc_prev_char(beg, end, enc);
5331     if (!p) return 0;
5332     if (p > beg && rb_enc_codepoint(p, end, enc) == '\n') {
5333         p2 = rb_enc_prev_char(beg, p, enc);
5334         if (p2 && rb_enc_codepoint(p2, end, enc) == '\r') p = p2;
5335     }
5336     return p - beg;
5337 }
5338
5339 /*
5340  *  call-seq:
5341  *     str.chop!   => str or nil
5342  *
5343  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
5344  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
5345  *  <code>String#chomp!</code>.
5346  */
5347
5348 static VALUE
5349 rb_str_chop_bang(VALUE str)
5350 {
5351     if (RSTRING_LEN(str) > 0) {
5352         long len;
5353         rb_str_modify(str);
5354         len = chopped_length(str);
5355         STR_SET_LEN(str, len);
5356         RSTRING_PTR(str)[len] = '\0';
5357         return str;
5358     }
5359     return Qnil;
5360 }
5361
5362
5363 /*
5364  *  call-seq:
5365  *     str.chop   => new_str
5366  *
5367  *  Returns a new <code>String</code> with the last character removed.  If the
5368  *  string ends with <code>\r\n</code>, both characters are removed. Applying
5369  *  <code>chop</code> to an empty string returns an empty
5370  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
5371  *  the string unchanged if it doesn't end in a record separator.
5372  *
5373  *     "string\r\n".chop   #=> "string"
5374  *     "string\n\r".chop   #=> "string\n"
5375  *     "string\n".chop     #=> "string"
5376  *     "string".chop       #=> "strin"
5377  *     "x".chop.chop       #=> ""
5378  */
5379
5380 static VALUE
5381 rb_str_chop(VALUE str)
5382 {
5383     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
5384     rb_enc_cr_str_copy_for_substr(str2, str);
5385     OBJ_INFECT(str2, str);
5386     return str2;
5387 }
5388
5389
5390 /*
5391  *  call-seq:
5392  *     str.chomp!(separator=$/)   => str or nil
5393  *
5394  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
5395  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
5396  */
5397
5398 static VALUE
5399 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
5400 {
5401     rb_encoding *enc;
5402     VALUE rs;
5403     int newline;
5404     char *p, *pp, *e;
5405     long len, rslen;
5406
5407     len = RSTRING_LEN(str);
5408     if (len == 0) return Qnil;
5409     p = RSTRING_PTR(str);
5410     e = p + len;
5411     if (argc == 0) {
5412         rs = rb_rs;
5413         if (rs == rb_default_rs) {
5414           smart_chomp:
5415             rb_str_modify(str);
5416             enc = rb_enc_get(str);
5417             if (rb_enc_mbminlen(enc) > 1) {
5418                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), enc);
5419                 if (rb_enc_is_newline(pp, e, enc)) {
5420                     e = pp;
5421                 }
5422                 pp = e - rb_enc_mbminlen(enc);
5423                 if (pp >= p) {
5424                     pp = rb_enc_left_char_head(p, pp, enc);
5425                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
5426                         e = pp;
5427                     }
5428                 }
5429                 if (e == RSTRING_END(str)) {
5430                     return Qnil;
5431                 }
5432                 len = e - RSTRING_PTR(str);
5433                 STR_SET_LEN(str, len);
5434             }
5435             else {
5436                 if (RSTRING_PTR(str)[len-1] == '\n') {
5437                     STR_DEC_LEN(str);
5438                     if (RSTRING_LEN(str) > 0 &&
5439                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
5440                         STR_DEC_LEN(str);
5441                     }
5442                 }
5443                 else if (RSTRING_PTR(str)[len-1] == '\r') {
5444                     STR_DEC_LEN(str);
5445                 }
5446                 else {
5447                     return Qnil;
5448                 }
5449             }
5450             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5451             return str;
5452         }
5453     }
5454     else {
5455         rb_scan_args(argc, argv, "01", &rs);
5456     }
5457     if (NIL_P(rs)) return Qnil;
5458     StringValue(rs);
5459     rslen = RSTRING_LEN(rs);
5460     if (rslen == 0) {
5461         while (len>0 && p[len-1] == '\n') {
5462             len--;
5463             if (len>0 && p[len-1] == '\r')
5464                 len--;
5465         }
5466         if (len < RSTRING_LEN(str)) {
5467             rb_str_modify(str);
5468             STR_SET_LEN(str, len);
5469             RSTRING_PTR(str)[len] = '\0';
5470             return str;
5471         }
5472         return Qnil;
5473     }
5474     if (rslen > len) return Qnil;
5475     newline = RSTRING_PTR(rs)[rslen-1];
5476     if (rslen == 1 && newline == '\n')
5477         goto smart_chomp;
5478
5479     enc = rb_enc_check(str, rs);
5480     if (is_broken_string(rs)) {
5481         return Qnil;
5482     }
5483     pp = e - rslen;
5484     if (p[len-1] == newline &&
5485         (rslen <= 1 ||
5486          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
5487         if (rb_enc_left_char_head(p, pp, enc) != pp)
5488             return Qnil;
5489         rb_str_modify(str);
5490         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
5491         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5492         return str;
5493     }
5494     return Qnil;
5495 }
5496
5497
5498 /*
5499  *  call-seq:
5500  *     str.chomp(separator=$/)   => new_str
5501  *
5502  *  Returns a new <code>String</code> with the given record separator removed
5503  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
5504  *  changed from the default Ruby record separator, then <code>chomp</code> also
5505  *  removes carriage return characters (that is it will remove <code>\n</code>,
5506  *  <code>\r</code>, and <code>\r\n</code>).
5507  *
5508  *     "hello".chomp            #=> "hello"
5509  *     "hello\n".chomp          #=> "hello"
5510  *     "hello\r\n".chomp        #=> "hello"
5511  *     "hello\n\r".chomp        #=> "hello\n"
5512  *     "hello\r".chomp          #=> "hello"
5513  *     "hello \n there".chomp   #=> "hello \n there"
5514  *     "hello".chomp("llo")     #=> "he"
5515  */
5516
5517 static VALUE
5518 rb_str_chomp(int argc, VALUE *argv, VALUE str)
5519 {
5520     str = rb_str_dup(str);
5521     rb_str_chomp_bang(argc, argv, str);
5522     return str;
5523 }
5524
5525 /*
5526  *  call-seq:
5527  *     str.lstrip!   => self or nil
5528  *
5529  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
5530  *  change was made. See also <code>String#rstrip!</code> and
5531  *  <code>String#strip!</code>.
5532  *
5533  *     "  hello  ".lstrip   #=> "hello  "
5534  *     "hello".lstrip!      #=> nil
5535  */
5536
5537 static VALUE
5538 rb_str_lstrip_bang(VALUE str)
5539 {
5540     rb_encoding *enc;
5541     char *s, *t, *e;
5542
5543     rb_str_modify(str);
5544     enc = STR_ENC_GET(str);
5545     s = RSTRING_PTR(str);
5546     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5547     e = t = RSTRING_END(str);
5548     /* remove spaces at head */
5549     while (s < e) {
5550         int cc = rb_enc_codepoint(s, e, enc);
5551
5552         if (!rb_enc_isspace(cc, enc)) break;
5553         s += rb_enc_codelen(cc, enc);
5554     }
5555
5556     if (s > RSTRING_PTR(str)) {
5557         rb_str_modify(str);
5558         STR_SET_LEN(str, t-s);
5559         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
5560         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5561         return str;
5562     }
5563     return Qnil;
5564 }
5565
5566
5567 /*
5568  *  call-seq:
5569  *     str.lstrip   => new_str
5570  *
5571  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
5572  *  <code>String#rstrip</code> and <code>String#strip</code>.
5573  *
5574  *     "  hello  ".lstrip   #=> "hello  "
5575  *     "hello".lstrip       #=> "hello"
5576  */
5577
5578 static VALUE
5579 rb_str_lstrip(VALUE str)
5580 {
5581     str = rb_str_dup(str);
5582     rb_str_lstrip_bang(str);
5583     return str;
5584 }
5585
5586
5587 /*
5588  *  call-seq:
5589  *     str.rstrip!   => self or nil
5590  *
5591  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
5592  *  no change was made. See also <code>String#lstrip!</code> and
5593  *  <code>String#strip!</code>.
5594  *
5595  *     "  hello  ".rstrip   #=> "  hello"
5596  *     "hello".rstrip!      #=> nil
5597  */
5598
5599 static VALUE
5600 rb_str_rstrip_bang(VALUE str)
5601 {
5602     rb_encoding *enc;
5603     char *s, *t, *e;
5604     int space_seen = Qfalse;
5605
5606     rb_str_modify(str);
5607     enc = STR_ENC_GET(str);
5608     s = RSTRING_PTR(str);
5609     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5610     t = e = RSTRING_END(str);
5611     while (s < e) {
5612         int cc = rb_enc_codepoint(s, e, enc);
5613
5614         if (!cc || rb_enc_isspace(cc, enc)) {
5615             if (!space_seen) t = s;
5616             space_seen = Qtrue;
5617         }
5618         else {
5619             space_seen = Qfalse;
5620         }
5621         s += rb_enc_codelen(cc, enc);
5622     }
5623     if (!space_seen) t = s;
5624     if (t < e) {
5625         rb_str_modify(str);
5626         STR_SET_LEN(str, t-RSTRING_PTR(str));
5627         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5628         return str;
5629     }
5630     return Qnil;
5631 }
5632
5633
5634 /*
5635  *  call-seq:
5636  *     str.rstrip   => new_str
5637  *
5638  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
5639  *  <code>String#lstrip</code> and <code>String#strip</code>.
5640  *
5641  *     "  hello  ".rstrip   #=> "  hello"
5642  *     "hello".rstrip       #=> "hello"
5643  */
5644
5645 static VALUE
5646 rb_str_rstrip(VALUE str)
5647 {
5648     str = rb_str_dup(str);
5649     rb_str_rstrip_bang(str);
5650     return str;
5651 }
5652
5653
5654 /*
5655  *  call-seq:
5656  *     str.strip!   => str or nil
5657  *
5658  *  Removes leading and trailing whitespace from <i>str</i>. Returns
5659  *  <code>nil</code> if <i>str</i> was not altered.
5660  */
5661
5662 static VALUE
5663 rb_str_strip_bang(VALUE str)
5664 {
5665     VALUE l = rb_str_lstrip_bang(str);
5666     VALUE r = rb_str_rstrip_bang(str);
5667
5668     if (NIL_P(l) && NIL_P(r)) return Qnil;
5669     return str;
5670 }
5671
5672
5673 /*
5674  *  call-seq:
5675  *     str.strip   => new_str
5676  *
5677  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
5678  *
5679  *     "    hello    ".strip   #=> "hello"
5680  *     "\tgoodbye\r\n".strip   #=> "goodbye"
5681  */
5682
5683 static VALUE
5684 rb_str_strip(VALUE str)
5685 {
5686     str = rb_str_dup(str);
5687     rb_str_strip_bang(str);
5688     return str;
5689 }
5690
5691 static VALUE
5692 scan_once(VALUE str, VALUE pat, long *start)
5693 {
5694     VALUE result, match;
5695     struct re_registers *regs;
5696     long i;
5697
5698     if (rb_reg_search(pat, str, *start, 0) >= 0) {
5699         match = rb_backref_get();
5700         regs = RMATCH_REGS(match);
5701         if (BEG(0) == END(0)) {
5702             rb_encoding *enc = STR_ENC_GET(str);
5703             /*
5704              * Always consume at least one character of the input string
5705              */
5706             if (RSTRING_LEN(str) > END(0))
5707                 *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
5708                                               RSTRING_END(str), enc);
5709             else
5710                 *start = END(0)+1;
5711         }
5712         else {
5713             *start = END(0);
5714         }
5715         if (regs->num_regs == 1) {
5716             return rb_reg_nth_match(0, match);
5717         }
5718         result = rb_ary_new2(regs->num_regs);
5719         for (i=1; i < regs->num_regs; i++) {
5720             rb_ary_push(result, rb_reg_nth_match(i, match));
5721         }
5722
5723         return result;
5724     }
5725     return Qnil;
5726 }
5727
5728
5729 /*
5730  *  call-seq:
5731  *     str.scan(pattern)                         => array
5732  *     str.scan(pattern) {|match, ...| block }   => str
5733  *
5734  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
5735  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
5736  *  generated and either added to the result array or passed to the block. If
5737  *  the pattern contains no groups, each individual result consists of the
5738  *  matched string, <code>$&</code>.  If the pattern contains groups, each
5739  *  individual result is itself an array containing one entry per group.
5740  *
5741  *     a = "cruel world"
5742  *     a.scan(/\w+/)        #=> ["cruel", "world"]
5743  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
5744  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
5745  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
5746  *
5747  *  And the block form:
5748  *
5749  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
5750  *     print "\n"
5751  *     a.scan(/(.)(.)/) {|x,y| print y, x }
5752  *     print "\n"
5753  *
5754  *  <em>produces:</em>
5755  *
5756  *     <<cruel>> <<world>>
5757  *     rceu lowlr
5758  */
5759
5760 static VALUE
5761 rb_str_scan(VALUE str, VALUE pat)
5762 {
5763     VALUE result;
5764     long start = 0;
5765     long last = -1, prev = 0;
5766     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
5767
5768     pat = get_pat(pat, 1);
5769     if (!rb_block_given_p()) {
5770         VALUE ary = rb_ary_new();
5771
5772         while (!NIL_P(result = scan_once(str, pat, &start))) {
5773             last = prev;
5774             prev = start;
5775             rb_ary_push(ary, result);
5776         }
5777         if (last >= 0) rb_reg_search(pat, str, last, 0);
5778         return ary;
5779     }
5780
5781     while (!NIL_P(result = scan_once(str, pat, &start))) {
5782         last = prev;
5783         prev = start;
5784         rb_yield(result);
5785         str_mod_check(str, p, len);
5786     }
5787     if (last >= 0) rb_reg_search(pat, str, last, 0);
5788     return str;
5789 }
5790
5791
5792 /*
5793  *  call-seq:
5794  *     str.hex   => integer
5795  *
5796  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
5797  *  (with an optional sign and an optional <code>0x</code>) and returns the
5798  *  corresponding number. Zero is returned on error.
5799  *
5800  *     "0x0a".hex     #=> 10
5801  *     "-1234".hex    #=> -4660
5802  *     "0".hex        #=> 0
5803  *     "wombat".hex   #=> 0
5804  */
5805
5806 static VALUE
5807 rb_str_hex(VALUE str)
5808 {
5809     rb_encoding *enc = rb_enc_get(str);
5810
5811     if (!rb_enc_asciicompat(enc)) {
5812         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5813     }
5814     return rb_str_to_inum(str, 16, Qfalse);
5815 }
5816
5817
5818 /*
5819  *  call-seq:
5820  *     str.oct   => integer
5821  *
5822  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
5823  *  optional sign) and returns the corresponding number.  Returns 0 if the
5824  *  conversion fails.
5825  *
5826  *     "123".oct       #=> 83
5827  *     "-377".oct      #=> -255
5828  *     "bad".oct       #=> 0
5829  *     "0377bad".oct   #=> 255
5830  */
5831
5832 static VALUE
5833 rb_str_oct(VALUE str)
5834 {
5835     rb_encoding *enc = rb_enc_get(str);
5836
5837     if (!rb_enc_asciicompat(enc)) {
5838         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5839     }
5840     return rb_str_to_inum(str, -8, Qfalse);
5841 }
5842
5843
5844 /*
5845  *  call-seq:
5846  *     str.crypt(other_str)   => new_str
5847  *
5848  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
5849  *  library function <code>crypt</code>. The argument is the salt string, which
5850  *  should be two characters long, each character drawn from
5851  *  <code>[a-zA-Z0-9./]</code>.
5852  */
5853
5854 static VALUE
5855 rb_str_crypt(VALUE str, VALUE salt)
5856 {
5857     extern char *crypt(const char *, const char *);
5858     VALUE result;
5859     const char *s;
5860
5861     StringValue(salt);
5862     if (RSTRING_LEN(salt) < 2)
5863         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
5864
5865     if (RSTRING_PTR(str)) s = RSTRING_PTR(str);
5866     else s = "";
5867     result = rb_str_new2(crypt(s, RSTRING_PTR(salt)));
5868     OBJ_INFECT(result, str);
5869     OBJ_INFECT(result, salt);
5870     return result;
5871 }
5872
5873
5874 /*
5875  *  call-seq:
5876  *     str.intern   => symbol
5877  *     str.to_sym   => symbol
5878  *
5879  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
5880  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
5881  *
5882  *     "Koala".intern         #=> :Koala
5883  *     s = 'cat'.to_sym       #=> :cat
5884  *     s == :cat              #=> true
5885  *     s = '@cat'.to_sym      #=> :@cat
5886  *     s == :@cat             #=> true
5887  *
5888  *  This can also be used to create symbols that cannot be represented using the
5889  *  <code>:xxx</code> notation.
5890  *
5891  *     'cat and dog'.to_sym   #=> :"cat and dog"
5892  */
5893
5894 VALUE
5895 rb_str_intern(VALUE s)
5896 {
5897     VALUE str = RB_GC_GUARD(s);
5898     VALUE sym;
5899     ID id, id2;
5900
5901     id = rb_intern_str(str);
5902     sym = ID2SYM(id);
5903     id2 = SYM2ID(sym);
5904     if (id != id2) {
5905         const char *name = rb_id2name(id2);
5906
5907         if (name) {
5908             rb_raise(rb_eRuntimeError, "symbol table overflow (%s given for %s)",
5909                      name, RSTRING_PTR(str));
5910         }
5911         else {
5912             rb_raise(rb_eRuntimeError, "symbol table overflow (symbol %s)",
5913                      RSTRING_PTR(str));
5914         }
5915     }
5916     return sym;
5917 }
5918
5919
5920 /*
5921  *  call-seq:
5922  *     str.ord   => integer
5923  *
5924  *  Return the <code>Integer</code> ordinal of a one-character string.
5925  *
5926  *     "a".ord         #=> 97
5927  */
5928
5929 VALUE
5930 rb_str_ord(VALUE s)
5931 {
5932     int c;
5933
5934     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
5935     return INT2NUM(c);
5936 }
5937 /*
5938  *  call-seq:
5939  *     str.sum(n=16)   => integer
5940  *
5941  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
5942  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
5943  *  to 16. The result is simply the sum of the binary value of each character in
5944  *  <i>str</i> modulo <code>2**n</code>. This is not a particularly good
5945  *  checksum.
5946  */
5947
5948 static VALUE
5949 rb_str_sum(int argc, VALUE *argv, VALUE str)
5950 {
5951     VALUE vbits;
5952     int bits;
5953     char *ptr, *p, *pend;
5954     long len;
5955
5956     if (argc == 0) {
5957         bits = 16;
5958     }
5959     else {
5960         rb_scan_args(argc, argv, "01", &vbits);
5961         bits = NUM2INT(vbits);
5962     }
5963     ptr = p = RSTRING_PTR(str);
5964     len = RSTRING_LEN(str);
5965     pend = p + len;
5966     if (bits >= sizeof(long)*CHAR_BIT) {
5967         VALUE sum = INT2FIX(0);
5968
5969         while (p < pend) {
5970             str_mod_check(str, ptr, len);
5971             sum = rb_funcall(sum, '+', 1, INT2FIX((unsigned char)*p));
5972             p++;
5973         }
5974         if (bits != 0) {
5975             VALUE mod;
5976
5977             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
5978             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
5979             sum = rb_funcall(sum, '&', 1, mod);
5980         }
5981         return sum;
5982     }
5983     else {
5984        unsigned long sum = 0;
5985
5986         while (p < pend) {
5987             str_mod_check(str, ptr, len);
5988             sum += (unsigned char)*p;
5989             p++;
5990         }
5991         if (bits != 0) {
5992            sum &= (((unsigned long)1)<<bits)-1;
5993         }
5994         return rb_int2inum(sum);
5995     }
5996 }
5997
5998 static VALUE
5999 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
6000 {
6001     rb_encoding *enc;
6002     VALUE w;
6003     long width, len, flen = 1, fclen = 1;
6004     VALUE res;
6005     char *p;
6006     const char *f = " ";
6007     long n, llen, rlen;
6008     volatile VALUE pad;
6009     int singlebyte = 1;
6010
6011     rb_scan_args(argc, argv, "11", &w, &pad);
6012     enc = STR_ENC_GET(str);
6013     width = NUM2LONG(w);
6014     if (argc == 2) {
6015         StringValue(pad);
6016         enc = rb_enc_check(str, pad);
6017         f = RSTRING_PTR(pad);
6018         flen = RSTRING_LEN(pad);
6019         fclen = str_strlen(pad, enc);
6020         singlebyte = single_byte_optimizable(pad);
6021         if (flen == 0 || fclen == 0) {
6022             rb_raise(rb_eArgError, "zero width padding");
6023         }
6024     }
6025     len = str_strlen(str, enc);
6026     if (width < 0 || len >= width) return rb_str_dup(str);
6027     n = width - len;
6028     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
6029     rlen = n - llen;
6030     res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2);
6031     p = RSTRING_PTR(res);
6032     while (llen) {
6033         if (flen <= 1) {
6034             *p++ = *f;
6035             llen--;
6036         }
6037         else if (llen > fclen) {
6038             memcpy(p,f,flen);
6039             p += flen;
6040             llen -= fclen;
6041         }
6042         else {
6043             char *fp = str_nth(f, f+flen, llen, enc, singlebyte);
6044             n = fp - f;
6045             memcpy(p,f,n);
6046             p+=n;
6047             break;
6048         }
6049     }
6050     memcpy(p, RSTRING_PTR(str), RSTRING_LEN(str));
6051     p+=RSTRING_LEN(str);
6052     while (rlen) {
6053         if (flen <= 1) {
6054             *p++ = *f;
6055             rlen--;
6056         }
6057         else if (rlen > fclen) {
6058             memcpy(p,f,flen);
6059             p += flen;
6060             rlen -= fclen;
6061         }
6062         else {
6063             char *fp = str_nth(f, f+flen, rlen, enc, singlebyte);
6064             n = fp - f;
6065             memcpy(p,f,n);
6066             p+=n;
6067             break;
6068         }
6069     }
6070     *p = '\0';
6071     STR_SET_LEN(res, p-RSTRING_PTR(res));
6072     OBJ_INFECT(res, str);
6073     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
6074     rb_enc_associate(res, enc);
6075     return res;
6076 }
6077
6078
6079 /*
6080  *  call-seq:
6081  *     str.ljust(integer, padstr=' ')   => new_str
6082  *
6083  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6084  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
6085  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6086  *
6087  *     "hello".ljust(4)            #=> "hello"
6088  *     "hello".ljust(20)           #=> "hello               "
6089  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
6090  */
6091
6092 static VALUE
6093 rb_str_ljust(int argc, VALUE *argv, VALUE str)
6094 {
6095     return rb_str_justify(argc, argv, str, 'l');
6096 }
6097
6098
6099 /*
6100  *  call-seq:
6101  *     str.rjust(integer, padstr=' ')   => new_str
6102  *
6103  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6104  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
6105  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6106  *
6107  *     "hello".rjust(4)            #=> "hello"
6108  *     "hello".rjust(20)           #=> "               hello"
6109  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
6110  */
6111
6112 static VALUE
6113 rb_str_rjust(int argc, VALUE *argv, VALUE str)
6114 {
6115     return rb_str_justify(argc, argv, str, 'r');
6116 }
6117
6118
6119 /*
6120  *  call-seq:
6121  *     str.center(integer, padstr=' ')   => new_str
6122  *
6123  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6124  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
6125  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6126  *
6127  *     "hello".center(4)         #=> "hello"
6128  *     "hello".center(20)        #=> "       hello        "
6129  *     "hello".center(20, '123') #=> "1231231hello12312312"
6130  */
6131
6132 static VALUE
6133 rb_str_center(int argc, VALUE *argv, VALUE str)
6134 {
6135     return rb_str_justify(argc, argv, str, 'c');
6136 }
6137
6138 /*
6139  *  call-seq:
6140  *     str.partition(sep)              => [head, sep, tail]
6141  *
6142  *  Searches the string for <i>sep</i> and returns the part before
6143  *  it, the <i>sep</i>, and the part after it.  If <i>sep</i> is not found,
6144  *  returns <i>str</i> and two empty strings.
6145  *
6146  *     "hello".partition("l")         #=> ["he", "l", "lo"]
6147  *     "hello".partition("x")         #=> ["hello", "", ""]
6148  */
6149
6150 static VALUE
6151 rb_str_partition(VALUE str, VALUE sep)
6152 {
6153     long pos;
6154     int regex = Qfalse;
6155
6156     if (TYPE(sep) == T_REGEXP) {
6157         pos = rb_reg_search(sep, str, 0, 0);
6158         regex = Qtrue;
6159     }
6160     else {
6161         VALUE tmp;
6162
6163         tmp = rb_check_string_type(sep);
6164         if (NIL_P(tmp)) {
6165             rb_raise(rb_eTypeError, "type mismatch: %s given",
6166                      rb_obj_classname(sep));
6167         }
6168         pos = rb_str_index(str, sep, 0);
6169     }
6170     if (pos < 0) {
6171       failed:
6172         return rb_ary_new3(3, str, rb_str_new(0,0),rb_str_new(0,0));
6173     }
6174     if (regex) {
6175         sep = rb_str_subpat(str, sep, 0);
6176         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
6177     }
6178     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
6179                           sep,
6180                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
6181                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
6182 }
6183
6184 /*
6185  *  call-seq:
6186  *     str.rpartition(sep)            => [head, sep, tail]
6187  *
6188  *  Searches <i>sep</i> in the string from the end of the string, and
6189  *  returns the part before it, the <i>sep</i>, and the part after it.
6190  *  If <i>sep</i> is not found, returns two empty strings and
6191  *  <i>str</i>.
6192  *
6193  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
6194  *     "hello".rpartition("x")         #=> ["", "", "hello"]
6195  */
6196
6197 static VALUE
6198 rb_str_rpartition(VALUE str, VALUE sep)
6199 {
6200     long pos = RSTRING_LEN(str);
6201     int regex = Qfalse;
6202
6203     if (TYPE(sep) == T_REGEXP) {
6204         pos = rb_reg_search(sep, str, pos, 1);
6205         regex = Qtrue;
6206     }
6207     else {
6208         VALUE tmp;
6209
6210         tmp = rb_check_string_type(sep);
6211         if (NIL_P(tmp)) {
6212             rb_raise(rb_eTypeError, "type mismatch: %s given",
6213                      rb_obj_classname(sep));
6214         }
6215         pos = rb_str_sublen(str, pos);
6216         pos = rb_str_rindex(str, sep, pos);
6217     }
6218     if (pos < 0) {
6219         return rb_ary_new3(3, rb_str_new(0,0),rb_str_new(0,0), str);
6220     }
6221     if (regex) {
6222         sep = rb_reg_nth_match(0, rb_backref_get());
6223     }
6224     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
6225                           sep,
6226                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
6227 }
6228
6229 /*
6230  *  call-seq:
6231  *     str.start_with?([prefix]+)   => true or false
6232  *
6233  *  Returns true if <i>str</i> starts with the prefix given.
6234  */
6235
6236 static VALUE
6237 rb_str_start_with(int argc, VALUE *argv, VALUE str)
6238 {
6239     int i;
6240
6241     for (i=0; i<argc; i++) {
6242         VALUE tmp = rb_check_string_type(argv[i]);
6243         if (NIL_P(tmp)) continue;
6244         rb_enc_check(str, tmp);
6245         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6246         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6247             return Qtrue;
6248     }
6249     return Qfalse;
6250 }
6251
6252 /*
6253  *  call-seq:
6254  *     str.end_with?([suffix]+)   => true or false
6255  *
6256  *  Returns true if <i>str</i> ends with the suffix given.
6257  */
6258
6259 static VALUE
6260 rb_str_end_with(int argc, VALUE *argv, VALUE str)
6261 {
6262     int i;
6263     char *p, *s;
6264     rb_encoding *enc;
6265
6266     for (i=0; i<argc; i++) {
6267         VALUE tmp = rb_check_string_type(argv[i]);
6268         if (NIL_P(tmp)) continue;
6269         enc = rb_enc_check(str, tmp);
6270         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6271         p = RSTRING_PTR(str);
6272         s = p + RSTRING_LEN(str) - RSTRING_LEN(tmp);
6273         if (rb_enc_left_char_head(p, s, enc) != s)
6274             continue;
6275         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6276             return Qtrue;
6277     }
6278     return Qfalse;
6279 }
6280
6281 void
6282 rb_str_setter(VALUE val, ID id, VALUE *var)
6283 {
6284     if (!NIL_P(val) && TYPE(val) != T_STRING) {
6285         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
6286     }
6287     *var = val;
6288 }
6289
6290
6291 /*
6292  *  call-seq:
6293  *     str.force_encoding(encoding)   => str
6294  *
6295  *  Changes the encoding to +encoding+ and returns self.
6296  */
6297
6298 static VALUE
6299 rb_str_force_encoding(VALUE str, VALUE enc)
6300 {
6301     str_modifiable(str);
6302     rb_enc_associate(str, rb_to_encoding(enc));
6303     return str;
6304 }
6305
6306 /*
6307  *  call-seq:
6308  *     str.valid_encoding?  => true or false
6309  *
6310  *  Returns true for a string which encoded correctly.
6311  *
6312  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding? => true
6313  *    "\xc2".force_encoding("UTF-8").valid_encoding? => false
6314  *    "\x80".force_encoding("UTF-8").valid_encoding? => false
6315  */
6316
6317 static VALUE
6318 rb_str_valid_encoding_p(VALUE str)
6319 {
6320     int cr = rb_enc_str_coderange(str);
6321
6322     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
6323 }
6324
6325 /*
6326  *  call-seq:
6327  *     str.ascii_only?  => true or false
6328  *
6329  *  Returns true for a string which has only ASCII characters.
6330  *
6331  *    "abc".force_encoding("UTF-8").ascii_only? => true
6332  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only? => false
6333  */
6334
6335 static VALUE
6336 rb_str_is_ascii_only_p(VALUE str)
6337 {
6338     int cr = rb_enc_str_coderange(str);
6339
6340     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
6341 }
6342
6343 /**********************************************************************
6344  * Document-class: Symbol
6345  *
 *  <code>Symbol</code> objects represent names and some strings
 *  inside the Ruby
 *  interpreter. They are generated using the <code>:name</code> and
 *  <code>:"string"</code> literal
 *  syntax, and by the various <code>to_sym</code> methods. The same
6351  *  <code>Symbol</code> object will be created for a given name or string
6352  *  for the duration of a program's execution, regardless of the context
6353  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
6354  *  one context, a method in another, and a class in a third, the
6355  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
6356  *  all three contexts.
6357  *
6358  *     module One
6359  *       class Fred
6360  *       end
6361  *       $f1 = :Fred
6362  *     end
6363  *     module Two
6364  *       Fred = 1
6365  *       $f2 = :Fred
6366  *     end
6367  *     def Fred()
6368  *     end
6369  *     $f3 = :Fred
6370  *     $f1.object_id   #=> 2514190
6371  *     $f2.object_id   #=> 2514190
6372  *     $f3.object_id   #=> 2514190
6373  *
6374  */
6375
6376
6377 /*
6378  *  call-seq:
6379  *     sym == obj   => true or false
6380  *
6381  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
6382  *  symbol, returns <code>true</code>. Otherwise, compares them
6383  *  as strings.
6384  */
6385
6386 static VALUE
6387 sym_equal(VALUE sym1, VALUE sym2)
6388 {
6389     if (sym1 == sym2) return Qtrue;
6390     return Qfalse;
6391 }
6392
6393
6394 /*
6395  *  call-seq:
6396  *     sym.inspect    => string
6397  *
6398  *  Returns the representation of <i>sym</i> as a symbol literal.
6399  *
6400  *     :fred.inspect   #=> ":fred"
6401  */
6402
6403 static VALUE
6404 sym_inspect(VALUE sym)
6405 {
6406     VALUE str;
6407     ID id = SYM2ID(sym);
6408     rb_encoding *enc;
6409
6410     sym = rb_id2str(id);
6411     enc = STR_ENC_GET(sym);
6412     str = rb_enc_str_new(0, RSTRING_LEN(sym)+1, enc);
6413     RSTRING_PTR(str)[0] = ':';
6414     memcpy(RSTRING_PTR(str)+1, RSTRING_PTR(sym), RSTRING_LEN(sym));
6415     if (RSTRING_LEN(sym) != strlen(RSTRING_PTR(sym)) ||
6416         !rb_enc_symname_p(RSTRING_PTR(sym), enc)) {
6417         str = rb_str_inspect(str);
6418         strncpy(RSTRING_PTR(str), ":\"", 2);
6419     }
6420     return str;
6421 }
6422
6423
6424 /*
6425  *  call-seq:
6426  *     sym.id2name   => string
6427  *     sym.to_s      => string
6428  *
6429  *  Returns the name or string corresponding to <i>sym</i>.
6430  *
6431  *     :fred.id2name   #=> "fred"
6432  */
6433
6434
6435 VALUE
6436 rb_sym_to_s(VALUE sym)
6437 {
6438     ID id = SYM2ID(sym);
6439
6440     return str_new3(rb_cString, rb_id2str(id));
6441 }
6442
6443
6444 /*
6445  * call-seq:
6446  *   sym.to_sym   => sym
6447  *   sym.intern   => sym
6448  *
6449  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
6450  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
6451  * in this case.
6452  */
6453
6454 static VALUE
6455 sym_to_sym(VALUE sym)
6456 {
6457     return sym;
6458 }
6459
6460 static VALUE
6461 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
6462 {
6463     VALUE obj;
6464
6465     if (argc < 1) {
6466         rb_raise(rb_eArgError, "no receiver given");
6467     }
6468     obj = argv[0];
6469     return rb_funcall3(obj, (ID)sym, argc - 1, argv + 1);
6470 }
6471
6472 /*
6473  * call-seq:
6474  *   sym.to_proc
6475  *
6476  * Returns a _Proc_ object which respond to the given method by _sym_.
6477  *
6478  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
6479  */
6480
6481 static VALUE
6482 sym_to_proc(VALUE sym)
6483 {
6484     return rb_proc_new(sym_call, (VALUE)SYM2ID(sym));
6485 }
6486
6487
6488 static VALUE
6489 sym_succ(VALUE sym)
6490 {
6491     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
6492 }
6493
6494 static VALUE
6495 sym_cmp(VALUE sym, VALUE other)
6496 {
6497     if (!SYMBOL_P(other)) {
6498         return Qnil;
6499     }
6500     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
6501 }
6502
6503 static VALUE
6504 sym_casecmp(VALUE sym, VALUE other)
6505 {
6506     if (!SYMBOL_P(other)) {
6507         return Qnil;
6508     }
6509     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
6510 }
6511
6512 static VALUE
6513 sym_match(VALUE sym, VALUE other)
6514 {
6515     return rb_str_match(rb_sym_to_s(sym), other);
6516 }
6517
6518 static VALUE
6519 sym_eqq(VALUE sym, VALUE other)
6520 {
6521     if (sym == other) return Qtrue;
6522     return rb_str_equal(rb_sym_to_s(sym), other);
6523 }
6524
6525 static VALUE
6526 sym_aref(int argc, VALUE *argv, VALUE sym)
6527 {
6528     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
6529 }
6530
6531 static VALUE
6532 sym_length(VALUE sym)
6533 {
6534     return rb_str_length(rb_id2str(SYM2ID(sym)));
6535 }
6536
6537 static VALUE
6538 sym_empty(VALUE sym)
6539 {
6540     return rb_str_empty(rb_id2str(SYM2ID(sym)));
6541 }
6542
6543 static VALUE
6544 sym_upcase(VALUE sym)
6545 {
6546     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
6547 }
6548
6549 static VALUE
6550 sym_downcase(VALUE sym)
6551 {
6552     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
6553 }
6554
6555 static VALUE
6556 sym_capitalize(VALUE sym)
6557 {
6558     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
6559 }
6560
6561 static VALUE
6562 sym_swapcase(VALUE sym)
6563 {
6564     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
6565 }
6566
6567 static VALUE
6568 sym_encoding(VALUE sym)
6569 {
6570     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
6571 }
6572
6573 ID
6574 rb_to_id(VALUE name)
6575 {
6576     VALUE tmp;
6577     ID id;
6578
6579     switch (TYPE(name)) {
6580       default:
6581         tmp = rb_check_string_type(name);
6582         if (NIL_P(tmp)) {
6583             rb_raise(rb_eTypeError, "%s is not a symbol",
6584                      RSTRING_PTR(rb_inspect(name)));
6585         }
6586         name = tmp;
6587         /* fall through */
6588       case T_STRING:
6589         name = rb_str_intern(name);
6590         /* fall through */
6591       case T_SYMBOL:
6592         return SYM2ID(name);
6593     }
6594     return id;
6595 }
6596
6597 /*
6598  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
6599  *  bytes, typically representing characters. String objects may be created
6600  *  using <code>String::new</code> or as literals.
6601  *
6602  *  Because of aliasing issues, users of strings should be aware of the methods
6603  *  that modify the contents of a <code>String</code> object.  Typically,
6604  *  methods with names ending in ``!'' modify their receiver, while those
6605  *  without a ``!'' return a new <code>String</code>.  However, there are
6606  *  exceptions, such as <code>String#[]=</code>.
6607  *
6608  */
6609
void
Init_String(void)
{
    /* Within this function every rb_intern(str) expands to
     * rb_intern_const(str). */
#undef rb_intern
#define rb_intern(str) rb_intern_const(str)

    /* --- class String: creation, comparison and basic operators --- */
    rb_cString  = rb_define_class("String", rb_cObject);
    rb_include_module(rb_cString, rb_mComparable);
    rb_define_alloc_func(rb_cString, str_alloc);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "==", rb_str_equal, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "+", rb_str_plus, 1);
    rb_define_method(rb_cString, "*", rb_str_times, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    /* "length" and "size" share one implementation. */
    rb_define_method(rb_cString, "length", rb_str_length, 0);
    rb_define_method(rb_cString, "size", rb_str_length, 0);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    /* "succ"/"next" and their bang forms share implementations. */
    rb_define_method(rb_cString, "succ", rb_str_succ, 0);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next", rb_str_succ, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "replace", rb_str_replace, 1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);

    /* --- conversions ("to_s" and "to_str" share one implementation) --- */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);

    /* --- case conversion, returning a value --- */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);

    /* --- case conversion, bang variants --- */
    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);

    /* --- number parsing, splitting, iteration aliases and misc ---
     * "lines"/"bytes"/"chars" use the same functions as the each_*
     * methods registered further down. */
    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
    rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat, 1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    rb_define_method(rb_cString, "intern", rb_str_intern, 0);
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    /* --- containment predicates --- */
    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    /* --- justification --- */
    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    /* --- substitution and trimming, returning a value --- */
    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);

    /* --- substitution and trimming, bang variants --- */
    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);

    /* --- character-set operations, returning a value --- */
    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    /* --- character-set operations, bang variants --- */
    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    /* --- iteration --- */
    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* "slice" uses the same function as "[]". */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    /* --- encoding --- */
    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    /* Cache the ID of "to_s" in the file-level id_to_s. */
    id_to_s = rb_intern("to_s");

    /* rb_fs starts out nil and is exposed under both "$;" and "$-F". */
    rb_fs = Qnil;
    rb_define_variable("$;", &rb_fs);
    rb_define_variable("$-F", &rb_fs);

    /* --- class Symbol: no allocator and no Symbol.new --- */
    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    rb_include_module(rb_cSymbol, rb_mComparable);
    rb_undef_alloc_func(rb_cSymbol);
    rb_undef_method(CLASS_OF(rb_cSymbol), "new");
    rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */

    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    /* --- Symbol comparison and matching --- */
    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);
    rb_define_method(rb_cSymbol, "===", sym_eqq, 1);

    /* --- Symbol methods that work on the symbol's name ---
     * "match" is registered with the same function as "=~". */
    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    rb_define_method(rb_cSymbol, "match", sym_match, 1);

    rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}