string.c

   1 /**********************************************************************
   2
   3   string.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 17:12:58 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
  10   Copyright (C) 2000  Information-technology Promotion Agency, Japan
  11
  12 **********************************************************************/
  13
  14 #include "ruby/ruby.h"
  15 #include "ruby/re.h"
  16 #include "ruby/encoding.h"
  17
/* Shorthand for entry `no` of the `beg` / `end` arrays of a variable named
 * `regs`.  The macros expand to a member access on `regs`, so they are only
 * usable where a pointer of that name is in scope.
 * NOTE(review): presumably `regs` holds regexp match registers (ruby/re.h);
 * no use is visible in this part of the file -- confirm. */
#define BEG(no) regs->beg[no]
#define END(no) regs->end[no]
  20
  21 #include <math.h>
  22 #include <ctype.h>
  23
  24 #ifdef HAVE_UNISTD_H
  25 #include <unistd.h>
  26 #endif
  27
/* Class objects for String and Symbol.  Only the definitions are visible
 * here; presumably they are assigned during interpreter initialization. */
VALUE rb_cString;
VALUE rb_cSymbol;

/*
 * String-specific flag bits.
 *
 *   STR_TMPLOCK  - string is temporarily locked; str_modifiable() refuses
 *                  to modify it while this is set.
 *   STR_NOEMBED  - contents live in as.heap (ptr/len/aux), not embedded.
 *   STR_SHARED   - as.heap.aux.shared refers to another string.
 *   STR_ASSOC    - as.heap.aux.shared refers to an associated object
 *                  (see rb_str_associate()).
 */
#define STR_TMPLOCK FL_USER7
#define STR_NOEMBED FL_USER1
#define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
#define STR_ASSOC   FL_USER3
/* Both NOEMBED and ELTS_SHARED set: heap string sharing a buffer. */
#define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
/* Both NOEMBED and STR_ASSOC set: heap string carrying an associated object. */
#define STR_ASSOC_P(s)  FL_ALL(s, STR_NOEMBED|STR_ASSOC)
/* "No capacity" states: for a heap string that is shared or associated the
 * aux union holds `shared`, so aux.capa must not be read or written. */
#define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
#define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
/* Clear the shared/assoc bits, but only on heap strings. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
} while (0)
  42
  43
/* Mark str as a heap (non-embedded) string and zero the embedded-length
 * bits in its flags.  Does not touch as.heap; callers fill that in. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET(str, STR_NOEMBED);\
    STR_SET_EMBED_LEN(str, 0);\
} while (0)
/* Mark str as embedded (clears STR_NOEMBED only). */
#define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
/* True when str stores its bytes inline (STR_NOEMBED not set). */
#define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
/* Store n as the embedded length, which lives in the flags word under
 * RSTRING_EMBED_LEN_MASK.  `n` is evaluated once; `str` is evaluated twice. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
  55
/* Set the byte length of str, writing it to the flags word for embedded
 * strings and to as.heap.len otherwise.  Does not resize the buffer or
 * write a terminator. */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN(str, n);\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)
  64
/* Decrease the byte length of str by one, for either representation.
 * No check is made that the length is positive.  The expansion declares a
 * local named `n` in the embedded branch. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN(str, n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)
  75
  76 #define RESIZE_CAPA(str,capacity) do {\
  77     if (STR_EMBED_P(str)) {\
  78         if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
  79             char *tmp = ALLOC_N(char, capacity+1);\
  80             memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
  81             RSTRING(str)->as.heap.ptr = tmp;\
  82             RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
  83             STR_SET_NOEMBED(str);\
  84             RSTRING(str)->as.heap.aux.capa = (capacity);\
  85         }\
  86     }\
  87     else {\
  88         REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
  89         if (!STR_NOCAPA_P(str))\
  90             RSTRING(str)->as.heap.aux.capa = (capacity);\
  91     }\
  92 } while (0)
  93
/* Coderange predicates.  Both go through rb_enc_str_coderange(), which may
 * scan the string and cache the result in its flags. */
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* The rb_encoding of str, looked up from its encoding index. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
  98
  99 static int
 100 single_byte_optimizable(VALUE str)
 101 {
 102     rb_encoding *enc = STR_ENC_GET(str);
 103
 104     if (rb_enc_mbmaxlen(enc) == 1)
 105         return 1;
 106
 107     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
 108     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
 109         return 1;
 110
 111     /* Conservative.  Possibly single byte.
 112      * "\xa1" in Shift_JIS for example. */
 113     return 0;
 114 }
 115
/* NOTE(review): presumably the default field separator used when splitting
 * strings; neither its initialization nor its uses are visible in this part
 * of the file -- confirm. */
VALUE rb_fs;
 117
/*
 * Returns a pointer to a non-ASCII byte in [p, e), or NULL when every byte
 * is ASCII.
 *
 * When SIZEOF_VALUE is 4 or 8 and the range is longer than two words, the
 * scan is done a VALUE-sized word at a time: bytes up to the first aligned
 * address are checked individually, then aligned words are tested against
 * NONASCII_MASK (the high bit of every byte).  The word loop only locates
 * the word containing a non-ASCII byte; the final byte loop pinpoints it
 * and also handles the unaligned tail.
 *
 * NONASCII_MASK is defined here as a side effect and is used later in the
 * file as the "word-at-a-time scanning is available" switch.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080LL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if (sizeof(VALUE) * 2 < e - p) {
        const VALUE *s, *t;
        const VALUE lowbits = sizeof(VALUE) - 1;
        /* s: p rounded up to the next word boundary */
        s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
        while (p < (const char *)s) {
            if (!ISASCII(*p))
                return p;
            p++;
        }
        /* t: e rounded down to a word boundary */
        t = (const VALUE*)(~lowbits & (VALUE)e);
        while (s < t) {
            if (*s & NONASCII_MASK) {
                /* stop at this word; the byte loop below finds the byte */
                t = s;
                break;
            }
            s++;
        }
        p = (const char *)t;
    }
#endif
    while (p < e) {
        if (!ISASCII(*p))
            return p;
        p++;
    }
    return NULL;
}
 154
 155 static int
 156 coderange_scan(const char *p, long len, rb_encoding *enc)
 157 {
 158     const char *e = p + len;
 159
 160     if (rb_enc_to_index(enc) == 0) {
 161         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 162         p = search_nonascii(p, e);
 163         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 164     }
 165
 166     if (rb_enc_asciicompat(enc)) {
 167         p = search_nonascii(p, e);
 168         if (!p) {
 169             return ENC_CODERANGE_7BIT;
 170         }
 171         while (p < e) {
 172             int ret = rb_enc_precise_mbclen(p, e, enc);
 173             if (!MBCLEN_CHARFOUND_P(ret)) {
 174                 return ENC_CODERANGE_BROKEN;
 175             }
 176             p += MBCLEN_CHARFOUND_LEN(ret);
 177             if (p < e) {
 178                 p = search_nonascii(p, e);
 179                 if (!p) {
 180                     return ENC_CODERANGE_VALID;
 181                 }
 182             }
 183         }
 184         if (e < p) {
 185             return ENC_CODERANGE_BROKEN;
 186         }
 187         return ENC_CODERANGE_VALID;
 188     }
 189
 190     while (p < e) {
 191         int ret = rb_enc_precise_mbclen(p, e, enc);
 192
 193         if (!MBCLEN_CHARFOUND_P(ret)) {
 194             return ENC_CODERANGE_BROKEN;
 195         }
 196         p += MBCLEN_CHARFOUND_LEN(ret);
 197     }
 198     if (e < p) {
 199         return ENC_CODERANGE_BROKEN;
 200     }
 201     return ENC_CODERANGE_VALID;
 202 }
 203
 204 long
 205 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
 206 {
 207     const char *p = s;
 208
 209     if (*cr == ENC_CODERANGE_BROKEN)
 210         return e - s;
 211
 212     if (rb_enc_to_index(enc) == 0) {
 213         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 214         p = search_nonascii(p, e);
 215         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
 216         return e - s;
 217     }
 218     else if (rb_enc_asciicompat(enc)) {
 219         p = search_nonascii(p, e);
 220         if (!p) {
 221             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
 222             return e - s;
 223         }
 224         while (p < e) {
 225             int ret = rb_enc_precise_mbclen(p, e, enc);
 226             if (!MBCLEN_CHARFOUND_P(ret)) {
 227                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 228                 return p - s;
 229             }
 230             p += MBCLEN_CHARFOUND_LEN(ret);
 231             if (p < e) {
 232                 p = search_nonascii(p, e);
 233                 if (!p) {
 234                     *cr = ENC_CODERANGE_VALID;
 235                     return e - s;
 236                 }
 237             }
 238         }
 239         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
 240         return p - s;
 241     }
 242     else {
 243         while (p < e) {
 244             int ret = rb_enc_precise_mbclen(p, e, enc);
 245             if (!MBCLEN_CHARFOUND_P(ret)) {
 246                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 247                 return p - s;
 248             }
 249             p += MBCLEN_CHARFOUND_LEN(ret);
 250         }
 251         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
 252         return p - s;
 253     }
 254 }
 255
 256 static inline void
 257 str_enc_copy(VALUE str1, VALUE str2)
 258 {
 259     rb_enc_set_index(str1, ENCODING_GET(str2));
 260 }
 261
 262 static void
 263 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
 264 {
 265     /* this function is designed for copying encoding and coderange
 266      * from src to new string "dest" which is made from the part of src.
 267      */
 268     str_enc_copy(dest, src);
 269     switch (ENC_CODERANGE(src)) {
 270       case ENC_CODERANGE_7BIT:
 271         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 272         break;
 273       case ENC_CODERANGE_VALID:
 274         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
 275             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
 276             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 277         else
 278             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 279         break;
 280       default:
 281         if (RSTRING_LEN(dest) == 0) {
 282             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
 283                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 284             else
 285                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 286         }
 287         break;
 288     }
 289 }
 290
 291 static void
 292 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
 293 {
 294     str_enc_copy(dest, src);
 295     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
 296 }
 297
 298 int
 299 rb_enc_str_coderange(VALUE str)
 300 {
 301     int cr = ENC_CODERANGE(str);
 302
 303     if (cr == ENC_CODERANGE_UNKNOWN) {
 304         rb_encoding *enc = STR_ENC_GET(str);
 305         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
 306         ENC_CODERANGE_SET(str, cr);
 307     }
 308     return cr;
 309 }
 310
 311 int
 312 rb_enc_str_asciionly_p(VALUE str)
 313 {
 314     rb_encoding *enc = STR_ENC_GET(str);
 315
 316     if (!rb_enc_asciicompat(enc))
 317         return Qfalse;
 318     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
 319         return Qtrue;
 320     return Qfalse;
 321 }
 322
 323 static inline void
 324 str_mod_check(VALUE s, const char *p, long len)
 325 {
 326     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
 327         rb_raise(rb_eRuntimeError, "string modified");
 328     }
 329 }
 330
 331 static inline void
 332 str_frozen_check(VALUE s)
 333 {
 334     if (OBJ_FROZEN(s)) {
 335         rb_raise(rb_eRuntimeError, "string frozen");
 336     }
 337 }
 338
 339 static VALUE
 340 str_alloc(VALUE klass)
 341 {
 342     NEWOBJ(str, struct RString);
 343     OBJSETUP(str, klass, T_STRING);
 344
 345     if (klass == rb_cSymbol) {
 346         /* need to be registered in table */
 347         RBASIC(str)->klass = rb_cString;
 348     }
 349     str->as.heap.ptr = 0;
 350     str->as.heap.len = 0;
 351     str->as.heap.aux.capa = 0;
 352
 353     return (VALUE)str;
 354 }
 355
 356 static VALUE
 357 str_new(VALUE klass, const char *ptr, long len)
 358 {
 359     VALUE str;
 360
 361     if (len < 0) {
 362         rb_raise(rb_eArgError, "negative string size (or size too big)");
 363     }
 364
 365     str = str_alloc(klass);
 366     if (len > RSTRING_EMBED_LEN_MAX) {
 367         RSTRING(str)->as.heap.aux.capa = len;
 368         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
 369         STR_SET_NOEMBED(str);
 370     }
 371     if (ptr) {
 372         memcpy(RSTRING_PTR(str), ptr, len);
 373     }
 374     STR_SET_LEN(str, len);
 375     RSTRING_PTR(str)[len] = '\0';
 376     return str;
 377 }
 378
 379 VALUE
 380 rb_str_new(const char *ptr, long len)
 381 {
 382     return str_new(rb_cString, ptr, len);
 383 }
 384
 385 VALUE
 386 rb_usascii_str_new(const char *ptr, long len)
 387 {
 388     VALUE str = rb_str_new(ptr, len);
 389     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 390     return str;
 391 }
 392
 393 VALUE
 394 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
 395 {
 396     VALUE str = rb_str_new(ptr, len);
 397     rb_enc_associate(str, enc);
 398     return str;
 399 }
 400
 401 VALUE
 402 rb_str_new2(const char *ptr)
 403 {
 404     if (!ptr) {
 405         rb_raise(rb_eArgError, "NULL pointer given");
 406     }
 407     return rb_str_new(ptr, strlen(ptr));
 408 }
 409
 410 VALUE
 411 rb_usascii_str_new2(const char *ptr)
 412 {
 413     VALUE str = rb_str_new2(ptr);
 414     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 415     return str;
 416 }
 417
 418 VALUE
 419 rb_tainted_str_new(const char *ptr, long len)
 420 {
 421     VALUE str = rb_str_new(ptr, len);
 422
 423     OBJ_TAINT(str);
 424     return str;
 425 }
 426
 427 VALUE
 428 rb_tainted_str_new2(const char *ptr)
 429 {
 430     VALUE str = rb_str_new2(ptr);
 431
 432     OBJ_TAINT(str);
 433     return str;
 434 }
 435
 436 static VALUE
 437 str_replace_shared(VALUE str2, VALUE str)
 438 {
 439     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
 440         STR_SET_EMBED(str2);
 441         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
 442         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
 443     }
 444     else {
 445         FL_SET(str2, STR_NOEMBED);
 446         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
 447         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
 448         RSTRING(str2)->as.heap.aux.shared = str;
 449         FL_SET(str2, ELTS_SHARED);
 450     }
 451     rb_enc_cr_str_exact_copy(str2, str);
 452
 453     return str2;
 454 }
 455
 456 static VALUE
 457 str_new_shared(VALUE klass, VALUE str)
 458 {
 459     return str_replace_shared(str_alloc(klass), str);
 460 }
 461
 462 static VALUE
 463 str_new3(VALUE klass, VALUE str)
 464 {
 465     return str_new_shared(klass, str);
 466 }
 467
 468 VALUE
 469 rb_str_new3(VALUE str)
 470 {
 471     VALUE str2 = str_new3(rb_obj_class(str), str);
 472
 473     OBJ_INFECT(str2, str);
 474     return str2;
 475 }
 476
/*
 * Creates a heap string of class klass that aliases the buffer of str
 * (same ptr and len, no bytes copied) and returns it.
 *
 *  - If str is already shared, the new string refers to the same
 *    aux.shared object as str.
 *  - Otherwise str itself is turned into a shared string whose aux.shared
 *    is the new string.  The new string is then the only one of the two
 *    without ELTS_SHARED, i.e. the one whose buffer rb_str_free() releases.
 *
 * Encoding, coderange and taint of str are copied to the result.
 * The code always uses the heap representation, so it expects str not to be
 * embedded (rb_str_new4() handles embedded strings before calling this).
 */
static VALUE
str_new4(VALUE klass, VALUE str)
{
    VALUE str2;

    str2 = str_alloc(klass);
    STR_SET_NOEMBED(str2);
    RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
    RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
    if (STR_SHARED_P(str)) {
        FL_SET(str2, ELTS_SHARED);
        RSTRING(str2)->as.heap.aux.shared = RSTRING(str)->as.heap.aux.shared;
    }
    else {
        /* str gives up ownership: it now shares str2's buffer */
        FL_SET(str, ELTS_SHARED);
        RSTRING(str)->as.heap.aux.shared = str2;
    }
    rb_enc_cr_str_exact_copy(str2, str);
    OBJ_INFECT(str2, str);
    return str2;
}
 498
/*
 * Returns a frozen string with the same contents as orig, avoiding a copy
 * of the bytes where possible.
 *
 *  - orig already frozen: orig itself is returned.
 *  - orig shared (with a non-zero aux.shared): the shared object is reused
 *    directly when it has the same length and class as orig and would not
 *    lose orig's taint; otherwise a new string over the shared object is
 *    made and narrowed to orig's contents.
 *  - orig embedded: the bytes are copied into a new string.
 *  - orig associated: the result aliases orig's buffer (str_new4()) and
 *    takes over the associated object.
 *  - otherwise: the result aliases orig's buffer (str_new4()).
 */
VALUE
rb_str_new4(VALUE orig)
{
    VALUE klass, str;

    if (OBJ_FROZEN(orig)) return orig;
    klass = rb_obj_class(orig);
    if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
        long ofs;
        /* byte count by which the shared object is longer than orig */
        ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
        if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
            (!OBJ_TAINTED(str) && OBJ_TAINTED(orig))) {
            str = str_new3(klass, str);
            /* NOTE(review): this assumes orig is the tail of the shared
             * buffer and that str_new3() returned a heap (non-embedded)
             * string -- confirm both hold for every shared object. */
            RSTRING(str)->as.heap.ptr += ofs;
            RSTRING(str)->as.heap.len -= ofs;
        }
        rb_enc_cr_str_exact_copy(str, orig);
        OBJ_INFECT(str, orig);
    }
    else if (STR_EMBED_P(orig)) {
        str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
        rb_enc_cr_str_exact_copy(str, orig);
        OBJ_INFECT(str, orig);
    }
    else if (STR_ASSOC_P(orig)) {
        /* aux.shared holds the associated object here; save it, because
         * str_new4() overwrites orig's aux.shared with the new string */
        VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
        FL_UNSET(orig, STR_ASSOC);
        str = str_new4(klass, orig);
        FL_SET(str, STR_ASSOC);
        RSTRING(str)->as.heap.aux.shared = assoc;
    }
    else {
        str = str_new4(klass, orig);
    }
    OBJ_FREEZE(str);
    return str;
}
 536
 537 VALUE
 538 rb_str_new5(VALUE obj, const char *ptr, long len)
 539 {
 540     return str_new(rb_obj_class(obj), ptr, len);
 541 }
 542
 543 #define STR_BUF_MIN_SIZE 128
 544
 545 VALUE
 546 rb_str_buf_new(long capa)
 547 {
 548     VALUE str = str_alloc(rb_cString);
 549
 550     if (capa < STR_BUF_MIN_SIZE) {
 551         capa = STR_BUF_MIN_SIZE;
 552     }
 553     FL_SET(str, STR_NOEMBED);
 554     RSTRING(str)->as.heap.aux.capa = capa;
 555     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
 556     RSTRING(str)->as.heap.ptr[0] = '\0';
 557
 558     return str;
 559 }
 560
 561 VALUE
 562 rb_str_buf_new2(const char *ptr)
 563 {
 564     VALUE str;
 565     long len = strlen(ptr);
 566
 567     str = rb_str_buf_new(len);
 568     rb_str_buf_cat(str, ptr, len);
 569
 570     return str;
 571 }
 572
 573 VALUE
 574 rb_str_tmp_new(long len)
 575 {
 576     return str_new(0, 0, len);
 577 }
 578
 579 void
 580 rb_str_free(VALUE str)
 581 {
 582     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
 583         xfree(RSTRING(str)->as.heap.ptr);
 584     }
 585 }
 586
 587 VALUE
 588 rb_str_to_str(VALUE str)
 589 {
 590     return rb_convert_type(str, T_STRING, "String", "to_str");
 591 }
 592
 593 void
 594 rb_str_shared_replace(VALUE str, VALUE str2)
 595 {
 596     rb_encoding *enc;
 597     int cr;
 598     if (str == str2) return;
 599     enc = STR_ENC_GET(str2);
 600     cr = ENC_CODERANGE(str2);
 601     rb_str_modify(str);
 602     if (OBJ_TAINTED(str2)) OBJ_TAINT(str);
 603     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
 604         STR_SET_EMBED(str);
 605         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
 606         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
 607         rb_enc_associate(str, enc);
 608         ENC_CODERANGE_SET(str, cr);
 609         return;
 610     }
 611     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
 612         xfree(RSTRING_PTR(str));
 613     }
 614     STR_SET_NOEMBED(str);
 615     STR_UNSET_NOCAPA(str);
 616     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
 617     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
 618     if (STR_NOCAPA_P(str2)) {
 619         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
 620         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
 621     }
 622     else {
 623         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
 624     }
 625     RSTRING(str2)->as.heap.ptr = 0;     /* abandon str2 */
 626     RSTRING(str2)->as.heap.len = 0;
 627     RSTRING(str2)->as.heap.aux.capa = 0;
 628     STR_UNSET_NOCAPA(str2);
 629     rb_enc_associate(str, enc);
 630     ENC_CODERANGE_SET(str, cr);
 631 }
 632
 633 static ID id_to_s;
 634
 635 VALUE
 636 rb_obj_as_string(VALUE obj)
 637 {
 638     VALUE str;
 639
 640     if (TYPE(obj) == T_STRING) {
 641         return obj;
 642     }
 643     str = rb_funcall(obj, id_to_s, 0);
 644     if (TYPE(str) != T_STRING)
 645         return rb_any_to_s(obj);
 646     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
 647     return str;
 648 }
 649
 650 static VALUE rb_str_replace(VALUE, VALUE);
 651
 652 VALUE
 653 rb_str_dup(VALUE str)
 654 {
 655     VALUE dup = str_alloc(rb_obj_class(str));
 656     rb_str_replace(dup, str);
 657     return dup;
 658 }
 659
 660
 661 /*
 662  *  call-seq:
 663  *     String.new(str="")   => new_str
 664  *
 665  *  Returns a new string object containing a copy of <i>str</i>.
 666  */
 667
 668 static VALUE
 669 rb_str_init(int argc, VALUE *argv, VALUE str)
 670 {
 671     VALUE orig;
 672
 673     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
 674         rb_str_replace(str, orig);
 675     return str;
 676 }
 677
 678 long
 679 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
 680 {
 681     long c;
 682     const char *q;
 683
 684     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 685         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
 686     }
 687     else if (rb_enc_asciicompat(enc)) {
 688         c = 0;
 689         while (p < e) {
 690             if (ISASCII(*p)) {
 691                 q = search_nonascii(p, e);
 692                 if (!q)
 693                     return c + (e - p);
 694                 c += q - p;
 695                 p = q;
 696             }
 697             p += rb_enc_mbclen(p, e, enc);
 698             c++;
 699         }
 700         return c;
 701     }
 702
 703     for (c=0; p<e; c++) {
 704         p += rb_enc_mbclen(p, e, enc);
 705     }
 706     return c;
 707 }
 708
 709 long
 710 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
 711 {
 712     long c;
 713     const char *q;
 714     int ret;
 715
 716     *cr = 0;
 717     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 718         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
 719     }
 720     else if (rb_enc_asciicompat(enc)) {
 721         c = 0;
 722         while (p < e) {
 723             if (ISASCII(*p)) {
 724                 q = search_nonascii(p, e);
 725                 if (!q) {
 726                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
 727                     return c + (e - p);
 728                 }
 729                 c += q - p;
 730                 p = q;
 731             }
 732             ret = rb_enc_precise_mbclen(p, e, enc);
 733             if (MBCLEN_CHARFOUND_P(ret)) {
 734                 *cr |= ENC_CODERANGE_VALID;
 735                 p += MBCLEN_CHARFOUND_LEN(ret);
 736             }
 737             else {
 738                 *cr = ENC_CODERANGE_BROKEN;
 739                 p++;
 740             }
 741             c++;
 742         }
 743         if (!*cr) *cr = ENC_CODERANGE_7BIT;
 744         return c;
 745     }
 746
 747     for (c=0; p<e; c++) {
 748         ret = rb_enc_precise_mbclen(p, e, enc);
 749         if (MBCLEN_CHARFOUND_P(ret)) {
 750             *cr |= ENC_CODERANGE_VALID;
 751             p += MBCLEN_CHARFOUND_LEN(ret);
 752         }
 753         else {
 754             *cr = ENC_CODERANGE_BROKEN;
 755             p++;
 756         }
 757     }
 758     if (!*cr) *cr = ENC_CODERANGE_7BIT;
 759     return c;
 760 }
 761
#ifdef NONASCII_MASK
/* A byte starts a UTF-8 character unless it has the form 10xxxxxx. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
/*
 * Counts the UTF-8 lead bytes in the VALUE-sized word at s, i.e. the bytes
 * for which is_utf8_lead_byte() holds, without looking at them one by one.
 * Only compiled when NONASCII_MASK is defined (SIZEOF_VALUE is 4 or 8).
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE d = *s;
    /* bit 6 of each byte becomes (bit6 | !bit7): set exactly for lead bytes */
    d |= ~(d>>1);
    /* move that bit to bit 0 of each byte and drop everything else */
    d >>= 6;
    d &= NONASCII_MASK >> 7;
    /* add the per-byte 0/1 values together into the low byte */
    d += (d>>8);
    d += (d>>16);
#if SIZEOF_VALUE == 8
    d += (d>>32);
#endif
    /* the sum is at most sizeof(VALUE), so four bits are enough */
    return (d&0xF);
}
#endif
 779
 780 static long
 781 str_strlen(VALUE str, rb_encoding *enc)
 782 {
 783     const char *p, *e;
 784     int n, cr;
 785
 786     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
 787     if (!enc) enc = STR_ENC_GET(str);
 788     p = RSTRING_PTR(str);
 789     e = RSTRING_END(str);
 790 #ifdef NONASCII_MASK
 791     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
 792         enc == rb_utf8_encoding()) {
 793         VALUE len = 0;
 794         if (sizeof(VALUE) * 2 < e - p) {
 795             const VALUE *s, *t;
 796             const VALUE lowbits = sizeof(VALUE) - 1;
 797             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
 798             t = (const VALUE*)(~lowbits & (VALUE)e);
 799             while (p < (const char *)s) {
 800                 if (is_utf8_lead_byte(*p)) len++;
 801                 p++;
 802             }
 803             while (s < t) {
 804                 len += count_utf8_lead_bytes_with_word(s);
 805                 s++;
 806             }
 807             p = (const char *)s;
 808         }
 809         while (p < e) {
 810             if (is_utf8_lead_byte(*p)) len++;
 811             p++;
 812         }
 813         return (long)len;
 814     }
 815 #endif
 816     n = rb_enc_strlen_cr(p, e, enc, &cr);
 817     if (cr) {
 818         ENC_CODERANGE_SET(str, cr);
 819     }
 820     return n;
 821 }
 822
 823 /*
 824  *  call-seq:
 825  *     str.length   => integer
 826  *     str.size     => integer
 827  *
 828  *  Returns the character length of <i>str</i>.
 829  */
 830
 831 VALUE
 832 rb_str_length(VALUE str)
 833 {
 834     int len;
 835
 836     len = str_strlen(str, STR_ENC_GET(str));
 837     return INT2NUM(len);
 838 }
 839
 840 /*
 841  *  call-seq:
 842  *     str.bytesize  => integer
 843  *
 844  *  Returns the length of <i>str</i> in bytes.
 845  */
 846
 847 static VALUE
 848 rb_str_bytesize(VALUE str)
 849 {
 850     return INT2NUM(RSTRING_LEN(str));
 851 }
 852
 853 /*
 854  *  call-seq:
 855  *     str.empty?   => true or false
 856  *
 857  *  Returns <code>true</code> if <i>str</i> has a length of zero.
 858  *
 859  *     "hello".empty?   #=> false
 860  *     "".empty?        #=> true
 861  */
 862
 863 static VALUE
 864 rb_str_empty(VALUE str)
 865 {
 866     if (RSTRING_LEN(str) == 0)
 867         return Qtrue;
 868     return Qfalse;
 869 }
 870
 871 /*
 872  *  call-seq:
 873  *     str + other_str   => new_str
 874  *
 875  *  Concatenation---Returns a new <code>String</code> containing
 876  *  <i>other_str</i> concatenated to <i>str</i>.
 877  *
 878  *     "Hello from " + self.to_s   #=> "Hello from main"
 879  */
 880
 881 VALUE
 882 rb_str_plus(VALUE str1, VALUE str2)
 883 {
 884     VALUE str3;
 885     rb_encoding *enc;
 886
 887     StringValue(str2);
 888     enc = rb_enc_check(str1, str2);
 889     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
 890     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
 891     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
 892            RSTRING_PTR(str2), RSTRING_LEN(str2));
 893     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
 894
 895     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
 896         OBJ_TAINT(str3);
 897     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
 898                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
 899     return str3;
 900 }
 901
 902 /*
 903  *  call-seq:
 904  *     str * integer   => new_str
 905  *
 906  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
 907  *  the receiver.
 908  *
 909  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
 910  */
 911
 912 VALUE
 913 rb_str_times(VALUE str, VALUE times)
 914 {
 915     VALUE str2;
 916     long n, len;
 917
 918     len = NUM2LONG(times);
 919     if (len < 0) {
 920         rb_raise(rb_eArgError, "negative argument");
 921     }
 922     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
 923         rb_raise(rb_eArgError, "argument too big");
 924     }
 925
 926     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
 927     if (len) {
 928         n = RSTRING_LEN(str);
 929         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), n);
 930         while (n <= len/2) {
 931             memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), n);
 932             n *= 2;
 933         }
 934         memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), len-n);
 935     }
 936     RSTRING_PTR(str2)[RSTRING_LEN(str2)] = '\0';
 937     OBJ_INFECT(str2, str);
 938     rb_enc_cr_str_copy_for_substr(str2, str);
 939
 940     return str2;
 941 }
 942
 943 /*
 944  *  call-seq:
 945  *     str % arg   => new_str
 946  *
 947  *  Format---Uses <i>str</i> as a format specification, and returns the result
 948  *  of applying it to <i>arg</i>. If the format specification contains more than
 949  *  one substitution, then <i>arg</i> must be an <code>Array</code> containing
 950  *  the values to be substituted. See <code>Kernel::sprintf</code> for details
 951  *  of the format string.
 952  *
 953  *     "%05d" % 123                              #=> "00123"
 954  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
 955  */
 956
 957 static VALUE
 958 rb_str_format_m(VALUE str, VALUE arg)
 959 {
 960     VALUE tmp = rb_check_array_type(arg);
 961
 962     if (!NIL_P(tmp)) {
 963         return rb_str_format(RARRAY_LEN(tmp), RARRAY_PTR(tmp), str);
 964     }
 965     return rb_str_format(1, &arg, str);
 966 }
 967
 968 static inline void
 969 str_modifiable(VALUE str)
 970 {
 971     if (FL_TEST(str, STR_TMPLOCK)) {
 972         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
 973     }
 974     if (OBJ_FROZEN(str)) rb_error_frozen("string");
 975     if (!OBJ_TAINTED(str) && rb_safe_level() >= 4)
 976         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
 977 }
 978
 979 static inline int
 980 str_independent(VALUE str)
 981 {
 982     str_modifiable(str);
 983     if (!STR_SHARED_P(str)) return 1;
 984     if (STR_EMBED_P(str)) return 1;
 985     return 0;
 986 }
 987
 988 static void
 989 str_make_independent(VALUE str)
 990 {
 991     char *ptr;
 992     long len = RSTRING_LEN(str);
 993
 994     ptr = ALLOC_N(char, len+1);
 995     if (RSTRING_PTR(str)) {
 996         memcpy(ptr, RSTRING_PTR(str), len);
 997     }
 998     STR_SET_NOEMBED(str);
 999     ptr[len] = 0;
1000     RSTRING(str)->as.heap.ptr = ptr;
1001     RSTRING(str)->as.heap.len = len;
1002     RSTRING(str)->as.heap.aux.capa = len;
1003     STR_UNSET_NOCAPA(str);
1004 }
1005
1006 void
1007 rb_str_modify(VALUE str)
1008 {
1009     if (!str_independent(str))
1010         str_make_independent(str);
1011     ENC_CODERANGE_CLEAR(str);
1012 }
1013
/*
 * Associates the array add with str.
 *
 * If str already has an associated array, add is concatenated onto it.
 * Otherwise str is brought into an unshared heap form whose capacity equals
 * its length, and add is stored in aux.shared with STR_ASSOC set.  If str
 * was sharing the buffer of a string that itself had an associated array,
 * that array is extended with add and reused.
 *
 * Raises if str is frozen.
 */
void
rb_str_associate(VALUE str, VALUE add)
{
    /* sanity check */
    if (OBJ_FROZEN(str)) rb_error_frozen("string");
    if (STR_ASSOC_P(str)) {
        /* already associated */
        rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
    }
    else {
        if (STR_SHARED_P(str)) {
            /* remember the string str was sharing with before detaching */
            VALUE assoc = RSTRING(str)->as.heap.aux.shared;
            str_make_independent(str);
            if (STR_ASSOC_P(assoc)) {
                assoc = RSTRING(assoc)->as.heap.aux.shared;
                rb_ary_concat(assoc, add);
                add = assoc;
            }
        }
        else if (STR_EMBED_P(str)) {
            /* aux only exists in the heap representation */
            str_make_independent(str);
        }
        else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
            /* aux.capa is about to be overwritten by the reference, so
             * shrink the buffer to exactly the current length first */
            RESIZE_CAPA(str, RSTRING_LEN(str));
        }
        FL_SET(str, STR_ASSOC);
        /* NOTE(review): clearing the class presumably hides the array from
         * Ruby-level code -- confirm. */
        RBASIC(add)->klass = 0;
        RSTRING(str)->as.heap.aux.shared = add;
    }
}
1044
1045 VALUE
1046 rb_str_associated(VALUE str)
1047 {
1048     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1049     if (STR_ASSOC_P(str)) {
1050         return RSTRING(str)->as.heap.aux.shared;
1051     }
1052     return Qfalse;
1053 }
1054
1055 VALUE
1056 rb_string_value(volatile VALUE *ptr)
1057 {
1058     VALUE s = *ptr;
1059     if (TYPE(s) != T_STRING) {
1060         s = rb_str_to_str(s);
1061         *ptr = s;
1062     }
1063     return s;
1064 }
1065
1066 char *
1067 rb_string_value_ptr(volatile VALUE *ptr)
1068 {
1069     return RSTRING_PTR(rb_string_value(ptr));
1070 }
1071
1072 char *
1073 rb_string_value_cstr(volatile VALUE *ptr)
1074 {
1075     VALUE str = rb_string_value(ptr);
1076     char *s = RSTRING_PTR(str);
1077
1078     if (!s || RSTRING_LEN(str) != strlen(s)) {
1079         rb_raise(rb_eArgError, "string contains null byte");
1080     }
1081     return s;
1082 }
1083
1084 VALUE
1085 rb_check_string_type(VALUE str)
1086 {
1087     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1088     return str;
1089 }
1090
/*
 *  call-seq:
 *     String.try_convert(obj) -> string or nil
 *
 *  Try to convert <i>obj</i> into a String, using to_str method.
 *  Returns the converted string, or nil if <i>obj</i> cannot be converted
 *  for any reason.
 *
 *     String.try_convert("str")     # => "str"
 *     String.try_convert(/re/)      # => nil
 */
1102 static VALUE
1103 rb_str_s_try_convert(VALUE dummy, VALUE str)
1104 {
1105     return rb_check_string_type(str);
1106 }
1107
1108 char*
1109 rb_enc_nth(const char *p, const char *e, int nth, rb_encoding *enc)
1110 {
1111     if (rb_enc_mbmaxlen(enc) == 1) {
1112         p += nth;
1113     }
1114     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1115         p += nth * rb_enc_mbmaxlen(enc);
1116     }
1117     else if (rb_enc_asciicompat(enc)) {
1118         const char *p2, *e2;
1119         int n;
1120
1121         while (p < e && 0 < nth) {
1122             e2 = p + nth;
1123             if (e < e2)
1124                 return (char *)e;
1125             if (ISASCII(*p)) {
1126                 p2 = search_nonascii(p, e2);
1127                 if (!p2)
1128                     return (char *)e2;
1129                 nth -= p2 - p;
1130                 p = p2;
1131             }
1132             n = rb_enc_mbclen(p, e, enc);
1133             p += n;
1134             nth--;
1135         }
1136         if (nth != 0)
1137             return (char *)e;
1138         return (char *)p;
1139     }
1140     else {
1141         while (p<e && nth--) {
1142             p += rb_enc_mbclen(p, e, enc);
1143         }
1144     }
1145     if (p > e) p = e;
1146     return (char*)p;
1147 }
1148
1149 static char*
1150 str_nth(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1151 {
1152     if (singlebyte)
1153         p += nth;
1154     else {
1155         p = rb_enc_nth(p, e, nth, enc);
1156     }
1157     if (!p) return 0;
1158     if (p > e) p = e;
1159     return (char *)p;
1160 }
1161
1162 /* char offset to byte offset */
1163 static int
1164 str_offset(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1165 {
1166     const char *pp = str_nth(p, e, nth, enc, singlebyte);
1167     if (!pp) return e - p;
1168     return pp - p;
1169 }
1170
1171 #ifdef NONASCII_MASK
1172 static char *
1173 str_utf8_nth(const char *p, const char *e, int nth)
1174 {
1175     if (sizeof(VALUE) * 2 < nth) {
1176         const VALUE *s, *t;
1177         const VALUE lowbits = sizeof(VALUE) - 1;
1178         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1179         t = (const VALUE*)(~lowbits & (VALUE)e);
1180         while (p < (const char *)s) {
1181             if (is_utf8_lead_byte(*p)) nth--;
1182             p++;
1183         }
1184         do {
1185             nth -= count_utf8_lead_bytes_with_word(s);
1186             s++;
1187         } while (s < t && sizeof(VALUE) <= nth);
1188         p = (char *)s;
1189     }
1190     while (p < e) {
1191         if (is_utf8_lead_byte(*p)) {
1192             if (nth == 0) break;
1193             nth--;
1194         }
1195         p++;
1196     }
1197     return (char *)p;
1198 }
1199
/*
 * Byte distance from p to the nth character of the UTF-8 range [p, e),
 * as located by str_utf8_nth(); e - p if that lookup yields no pointer.
 */
static int
str_utf8_offset(const char *p, const char *e, int nth)
{
    const char *target = str_utf8_nth(p, e, nth);

    return (int)((target ? target : e) - p);
}
1207 #endif
1208
1209 /* byte offset to char offset */
1210 long
1211 rb_str_sublen(VALUE str, long pos)
1212 {
1213     if (single_byte_optimizable(str) || pos < 0)
1214         return pos;
1215     else {
1216         char *p = RSTRING_PTR(str);
1217         return rb_enc_strlen(p, p + pos, STR_ENC_GET(str));
1218     }
1219 }
1220
1221 VALUE
1222 rb_str_subseq(VALUE str, long beg, long len)
1223 {
1224     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1225
1226     rb_enc_cr_str_copy_for_substr(str2, str);
1227     OBJ_INFECT(str2, str);
1228
1229     return str2;
1230 }
1231
/*
 * Returns a new string made of at most len characters of str starting at
 * character index beg; a negative beg counts back from the end.  Returns
 * Qnil when len is negative or when beg lies outside the string.
 *
 * beg and len arrive as character counts; on the way to the `sub' label
 * len is converted to a byte count and p is set to the first byte of the
 * slice.
 */
VALUE
rb_str_substr(VALUE str, long beg, long len)
{
    rb_encoding *enc = STR_ENC_GET(str);
    VALUE str2;
    char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
    int singlebyte;

    if (len < 0) return Qnil;
    if (!RSTRING_LEN(str)) {
        len = 0;
    }
    if (beg < 0) {
        /* cannot take more characters than lie between beg and the end */
        if (len > -beg) len = -beg;
        /* start is near the end relative to the byte length: walk
         * backwards from the end instead of counting the whole string */
        if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
            beg = -beg;
            /* move e back over the characters that follow the slice */
            while (beg-- > len && (e = rb_enc_prev_char(s, e, enc)) != 0);
            p = e;
            if (!p) return Qnil;
            /* move p back over the characters of the slice itself */
            while (len-- > 0 && (p = rb_enc_prev_char(s, p, enc)) != 0);
            if (!p) return Qnil;
            len = e - p;        /* byte length of the slice */
            goto sub;
        }
        else {
            beg += str_strlen(str, enc);
            if (beg < 0) return Qnil;
        }
    }
    else if (beg > 0 && beg > str_strlen(str, enc)) {
        return Qnil;
    }
    singlebyte = single_byte_optimizable(str);
    if (len == 0) {
        /* empty slice: no start pointer is computed */
        p = 0;
    }
#ifdef NONASCII_MASK
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
        enc == rb_utf8_encoding()) {
        /* valid UTF-8: use the UTF-8 specific scanners */
        p = str_utf8_nth(s, e, beg);
        len = str_utf8_offset(p, e, len);
    }
#endif
    else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) {
        /* slice starts at the very end */
        len = 0;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* fixed-width encoding: scale to bytes, clamped to what remains */
        if (len * rb_enc_mbmaxlen(enc) > e - p)
            len = e - p;
        else
            len *= rb_enc_mbmaxlen(enc);
    }
    else {
        /* variable-width encoding: byte offset of the len-th char after p */
        len = str_offset(p, e, len, enc, singlebyte);
    }
  sub:
    if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
        /* NOTE(review): this branch appears to reuse str's buffer through
         * rb_str_new4()/str_new3() rather than copying, then narrows the
         * result to the trailing len bytes -- confirm against those
         * helpers.  beg is not a byte offset on every path reaching
         * here. */
        str2 = rb_str_new4(str);
        str2 = str_new3(rb_obj_class(str2), str2);
        RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
        RSTRING(str2)->as.heap.len = len;
    }
    else {
        str2 = rb_str_new5(str, p, len);
        rb_enc_cr_str_copy_for_substr(str2, str);
        OBJ_INFECT(str2, str);
    }

    return str2;
}
1302
1303 VALUE
1304 rb_str_freeze(VALUE str)
1305 {
1306     if (STR_ASSOC_P(str)) {
1307         VALUE ary = RSTRING(str)->as.heap.aux.shared;
1308         OBJ_FREEZE(ary);
1309     }
1310     return rb_obj_freeze(str);
1311 }
1312
1313 VALUE
1314 rb_str_dup_frozen(VALUE str)
1315 {
1316     if (STR_SHARED_P(str) && RSTRING(str)->as.heap.aux.shared) {
1317         VALUE shared = RSTRING(str)->as.heap.aux.shared;
1318         if (RSTRING_LEN(shared) == RSTRING_LEN(str)) {
1319             OBJ_FREEZE(shared);
1320             return shared;
1321         }
1322     }
1323     if (OBJ_FROZEN(str)) return str;
1324     str = rb_str_dup(str);
1325     OBJ_FREEZE(str);
1326     return str;
1327 }
1328
1329 VALUE
1330 rb_str_locktmp(VALUE str)
1331 {
1332     if (FL_TEST(str, STR_TMPLOCK)) {
1333         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1334     }
1335     FL_SET(str, STR_TMPLOCK);
1336     return str;
1337 }
1338
1339 VALUE
1340 rb_str_unlocktmp(VALUE str)
1341 {
1342     if (!FL_TEST(str, STR_TMPLOCK)) {
1343         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1344     }
1345     FL_UNSET(str, STR_TMPLOCK);
1346     return str;
1347 }
1348
1349 void
1350 rb_str_set_len(VALUE str, long len)
1351 {
1352     STR_SET_LEN(str, len);
1353     RSTRING_PTR(str)[len] = '\0';
1354 }
1355
1356 VALUE
1357 rb_str_resize(VALUE str, long len)
1358 {
1359     long slen;
1360
1361     if (len < 0) {
1362         rb_raise(rb_eArgError, "negative string size (or size too big)");
1363     }
1364
1365     rb_str_modify(str);
1366     slen = RSTRING_LEN(str);
1367     if (len != slen) {
1368         if (STR_EMBED_P(str)) {
1369             char *ptr;
1370             if (len <= RSTRING_EMBED_LEN_MAX) {
1371                 STR_SET_EMBED_LEN(str, len);
1372                 RSTRING(str)->as.ary[len] = '\0';
1373                 return str;
1374             }
1375             ptr = ALLOC_N(char,len+1);
1376             MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
1377             RSTRING(str)->as.heap.ptr = ptr;
1378             STR_SET_NOEMBED(str);
1379         }
1380         else if (len <= RSTRING_EMBED_LEN_MAX) {
1381             char *ptr = RSTRING(str)->as.heap.ptr;
1382             STR_SET_EMBED(str);
1383             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, len);
1384             RSTRING(str)->as.ary[len] = '\0';
1385             STR_SET_EMBED_LEN(str, len);
1386             xfree(ptr);
1387             return str;
1388         }
1389         else if (slen < len || slen - len > 1024) {
1390             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1391         }
1392         if (!STR_NOCAPA_P(str)) {
1393             RSTRING(str)->as.heap.aux.capa = len;
1394         }
1395         RSTRING(str)->as.heap.len = len;
1396         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
1397     }
1398     return str;
1399 }
1400
1401 VALUE
1402 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1403 {
1404     long capa, total;
1405
1406     if (len == 0) return str;
1407     if (len < 0) {
1408         rb_raise(rb_eArgError, "negative string size (or size too big)");
1409     }
1410     rb_str_modify(str);
1411     if (STR_ASSOC_P(str)) {
1412         FL_UNSET(str, STR_ASSOC);
1413         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1414     }
1415     else if (STR_EMBED_P(str)) {
1416         capa = RSTRING_EMBED_LEN_MAX;
1417     }
1418     else {
1419         capa = RSTRING(str)->as.heap.aux.capa;
1420     }
1421     total = RSTRING_LEN(str)+len;
1422     if (capa <= total) {
1423         while (total > capa) {
1424             capa = (capa + 1) * 2;
1425         }
1426         RESIZE_CAPA(str, capa);
1427     }
1428     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1429     STR_SET_LEN(str, total);
1430     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1431
1432     return str;
1433 }
1434
1435 VALUE
1436 rb_str_buf_cat2(VALUE str, const char *ptr)
1437 {
1438     return rb_str_buf_cat(str, ptr, strlen(ptr));
1439 }
1440
1441 VALUE
1442 rb_str_cat(VALUE str, const char *ptr, long len)
1443 {
1444     if (len < 0) {
1445         rb_raise(rb_eArgError, "negative string size (or size too big)");
1446     }
1447     if (STR_ASSOC_P(str)) {
1448         rb_str_modify(str);
1449         if (STR_EMBED_P(str)) str_make_independent(str);
1450         REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
1451         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
1452         RSTRING(str)->as.heap.len += len;
1453         RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0'; /* sentinel */
1454         return str;
1455     }
1456
1457     return rb_str_buf_cat(str, ptr, len);
1458 }
1459
1460 VALUE
1461 rb_str_cat2(VALUE str, const char *ptr)
1462 {
1463     return rb_str_cat(str, ptr, strlen(ptr));
1464 }
1465
/*
 * Appends len bytes at ptr to str and returns str.  The bytes are tagged
 * with encoding index ptr_encindex and coderange ptr_cr; str's encoding
 * index and coderange are updated to describe the combined contents.
 *
 * When ptr_cr_ret is non-NULL it receives the coderange settled on for
 * ptr.  It is not written on the early returns taken for
 * non-ASCII-compatible encodings.
 *
 * Raises ArgumentError when the two encodings cannot be combined or when
 * len is negative.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    long capa, total, off = -1;

    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;
    int str_a8 = ENCODING_IS_ASCII8BIT(str);
    int ptr_a8 = ptr_encindex == 0;

    str_cr = ENC_CODERANGE(str);

    if (str_encindex == ptr_encindex) {
        /* same encoding: skip scanning ptr when the result's coderange
         * will not be derived from it anyway */
        if (str_cr == ENC_CODERANGE_UNKNOWN ||
            (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
            ptr_cr = ENC_CODERANGE_UNKNOWN;
        }
        else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
        rb_encoding *str_enc = rb_enc_from_index(str_encindex);
        rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* differing encodings, at least one not ASCII-compatible:
             * only an empty side makes the append possible */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                return str;
            }
            goto incompatible;
        }
        if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, ptr_enc);
        }
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            /* str's coderange is only needed when it can affect the
             * compatibility decision below */
            if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;

    /* differing encodings are acceptable only if one side is 7-bit */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
      incompatible:
        rb_raise(rb_eArgError, "append incompatible encoding strings: %s and %s",
            rb_enc_name(rb_enc_from_index(str_encindex)),
            rb_enc_name(rb_enc_from_index(ptr_encindex)));
    }

    /* choose the encoding index and coderange of the result */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            /* both 7-bit: keep str's encoding unless it is ASCII-8BIT */
            res_encindex = !str_a8 ? str_encindex : ptr_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
        res_cr = str_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        /* appending to a broken string resets the coderange to unknown */
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    /* ptr may point into str itself: remember it as an offset because
     * the buffer can move below */
    if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
        off = ptr - RSTRING_PTR(str);
    }
    rb_str_modify(str);
    if (len == 0) {
        ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
        return str;
    }
    if (STR_ASSOC_P(str)) {
        /* drop the association; the aux slot now holds the capacity */
        FL_UNSET(str, STR_ASSOC);
        capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
    }
    else if (STR_EMBED_P(str)) {
        capa = RSTRING_EMBED_LEN_MAX;
    }
    else {
        capa = RSTRING(str)->as.heap.aux.capa;
    }
    total = RSTRING_LEN(str)+len;
    if (capa <= total) {
        /* grow by repeated doubling until the result fits */
        while (total > capa) {
            capa = (capa + 1) * 2;
        }
        RESIZE_CAPA(str, capa);
    }
    if (off != -1) {
        ptr = RSTRING_PTR(str) + off;
    }
    memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
    STR_SET_LEN(str, total);
    RSTRING_PTR(str)[total] = '\0'; /* sentinel */

    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;
}
1585
1586 VALUE
1587 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
1588 {
1589     return rb_enc_cr_str_buf_cat(str, ptr, len,
1590         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
1591 }
1592
1593 VALUE
1594 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
1595 {
1596     /* ptr must reference NUL terminated ASCII string. */
1597     int encindex = ENCODING_GET(str);
1598     rb_encoding *enc = rb_enc_from_index(encindex);
1599     if (rb_enc_asciicompat(enc)) {
1600         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
1601             encindex, ENC_CODERANGE_7BIT, 0);
1602     }
1603     else {
1604         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
1605         while (*ptr) {
1606             int c = (unsigned char)*ptr;
1607             int len = rb_enc_codelen(c, enc);
1608             rb_enc_mbcput(c, buf, enc);
1609             rb_enc_cr_str_buf_cat(str, buf, len,
1610                 encindex, ENC_CODERANGE_VALID, 0);
1611             ptr++;
1612         }
1613         return str;
1614     }
1615 }
1616
1617 VALUE
1618 rb_str_buf_append(VALUE str, VALUE str2)
1619 {
1620     int str2_cr;
1621
1622     str2_cr = ENC_CODERANGE(str2);
1623
1624     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
1625         ENCODING_GET(str2), str2_cr, &str2_cr);
1626
1627     OBJ_INFECT(str, str2);
1628     ENC_CODERANGE_SET(str2, str2_cr);
1629
1630     return str;
1631 }
1632
1633 VALUE
1634 rb_str_append(VALUE str, VALUE str2)
1635 {
1636     rb_encoding *enc;
1637     int cr, cr2;
1638
1639     StringValue(str2);
1640     if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
1641         long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
1642         enc = rb_enc_check(str, str2);
1643         cr = ENC_CODERANGE(str);
1644         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
1645         rb_str_modify(str);
1646         REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1647         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
1648                RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
1649         RSTRING(str)->as.heap.len = len;
1650         rb_enc_associate(str, enc);
1651         ENC_CODERANGE_SET(str, cr);
1652         OBJ_INFECT(str, str2);
1653         return str;
1654     }
1655     return rb_str_buf_append(str, str2);
1656 }
1657
1658
1659 /*
1660  *  call-seq:
1661  *     str << fixnum        => str
1662  *     str.concat(fixnum)   => str
1663  *     str << obj           => str
1664  *     str.concat(obj)      => str
1665  *
1666  *  Append---Concatenates the given object to <i>str</i>. If the object is a
1667  *  <code>Fixnum</code>, it is considered as a codepoint, and is converted
1668  *  to a character before concatenation.
1669  *
1670  *     a = "hello "
1671  *     a << "world"   #=> "hello world"
1672  *     a.concat(33)   #=> "hello world!"
1673  */
1674
1675 VALUE
1676 rb_str_concat(VALUE str1, VALUE str2)
1677 {
1678     if (FIXNUM_P(str2)) {
1679         rb_encoding *enc = STR_ENC_GET(str1);
1680         int c = FIX2INT(str2);
1681         int pos = RSTRING_LEN(str1);
1682         int len = rb_enc_codelen(c, enc);
1683         int cr = ENC_CODERANGE(str1);
1684
1685         rb_str_resize(str1, pos+len);
1686         rb_enc_mbcput(c, RSTRING_PTR(str1)+pos, enc);
1687         ENC_CODERANGE_SET(str1, cr);
1688         return str1;
1689     }
1690     return rb_str_append(str1, str2);
1691 }
1692
/*
 * UNALIGNED_WORD_ACCESS is set to 1 on x86 (__i386__ / _M_IX86) and
 * defaults to 0 when nothing has defined it.  When it is 0, hash() below
 * compiles in a path that reassembles words from misaligned input.
 */
#if defined __i386__ || defined _M_IX86
#define UNALIGNED_WORD_ACCESS 1
#endif
#ifndef UNALIGNED_WORD_ACCESS
#define UNALIGNED_WORD_ACCESS 0
#endif
1699
/* MurmurHash described in http://murmurhash.googlepages.com/ */
/*
 * Hashes len bytes at data, starting from seed h, and returns the
 * resulting value.  Input is consumed four bytes at a time with a
 * multiply/xor-shift mixing step; the 1-3 trailing bytes are folded in
 * separately, followed by a final mixing sequence.
 *
 * When UNALIGNED_WORD_ACCESS is 0 and data is not 4-byte aligned, the
 * words fed to the mixer are reassembled from aligned 32-bit loads.
 */
static unsigned int
hash(const unsigned char * data, int len, unsigned int h)
{
    const unsigned int m = 0x7fd652ad;  /* mixing multiplier */
    const int r = 16;                   /* mixing shift */

    h += 0xdeadbeef;

    if (len >= 4) {
#if !UNALIGNED_WORD_ACCESS
        int align = (VALUE)data & 3;    /* byte offset within a 4-byte word */
        if (align) {
            uint32_t t = 0, d = 0;
            int sl, sr, pack;

            /* gather the 4-align bytes before the next word boundary
             * (cases fall through on purpose) */
            switch (align) {
#ifdef WORDS_BIGENDIAN
              case 1: t |= data[2];
              case 2: t |= data[1] << 8;
              case 3: t |= data[0] << 16;
#else
              case 1: t |= data[2] << 16;
              case 2: t |= data[1] << 8;
              case 3: t |= data[0];
#endif
            }

#ifdef WORDS_BIGENDIAN
            t >>= (8 * align) - 8;
#else
            t <<= (8 * align);
#endif

            /* advance data to the next 4-byte boundary */
            data += 4-align;
            len -= 4-align;

            sl = 8 * (4-align);
            sr = 8 * align;

            /* each mixed word joins the carried bytes in t with part of
             * the newly loaded aligned word d; d is then carried over */
            while (len >= 4) {
                d = *(uint32_t *)data;
#ifdef WORDS_BIGENDIAN
                t = (t << sr) | (d >> sl);
#else
                t = (t >> sr) | (d << sl);
#endif
                h += t;
                h *= m;
                h ^= h >> r;
                t = d;

                data += 4;
                len -= 4;
            }

            /* complete the last carried word with up to `align' of the
             * remaining bytes (cases fall through on purpose) */
            pack = len < align ? len : align;
            d = 0;
            switch (pack) {
#ifdef WORDS_BIGENDIAN
              case 3: d |= data[2] << 8;
              case 2: d |= data[1] << 16;
              case 1: d |= data[0] << 24;
              case 0:
                h += (t << sr) | (d >> sl);
#else
              case 3: d |= data[2] << 16;
              case 2: d |= data[1] << 8;
              case 1: d |= data[0];
              case 0:
                h += (t >> sr) | (d << sl);
#endif
                h *= m;
                h ^= h >> r;
            }

            data += pack;
            len -= pack;
        }
        else
#endif
        {
            /* direct 32-bit loads from data */
            do {
                h += *(uint32_t *)data;
                h *= m;
                h ^= h >> r;

                data += 4;
                len -= 4;
            } while (len >= 4);
        }
    }

    /* fold in the 1-3 bytes left over (cases fall through on purpose) */
    switch(len) {
#ifdef WORDS_BIGENDIAN
      case 3:
        h += data[2] << 8;
      case 2:
        h += data[1] << 16;
      case 1:
        h += data[0] << 24;
#else
      case 3:
        h += data[2] << 16;
      case 2:
        h += data[1] << 8;
      case 1:
        h += data[0];
#endif
        h *= m;
        h ^= h >> r;
    }

    /* final mixing */
    h *= m;
    h ^= h >> 10;
    h *= m;
    h ^= h >> 17;

    return h;
}
1820
/*
 * Hashes len bytes at ptr with hash(), using a seed of 0, and returns
 * the value as an int.
 */
int
rb_memhash(const void *ptr, long len)
{
    const unsigned char *bytes = ptr;

    return (int)hash(bytes, (int)len, 0);
}
1826
1827 int
1828 rb_str_hash(VALUE str)
1829 {
1830     return hash((const void *)RSTRING_PTR(str), RSTRING_LEN(str), 0);
1831 }
1832
1833 int
1834 rb_str_hash_cmp(VALUE str1, VALUE str2)
1835 {
1836     int len;
1837
1838     if (!rb_str_comparable(str1, str2)) return 1;
1839     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1840         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1841         return 0;
1842     }
1843     return 1;
1844 }
1845
1846 /*
1847  * call-seq:
1848  *    str.hash   => fixnum
1849  *
1850  * Return a hash based on the string's length and content.
1851  */
1852
1853 static VALUE
1854 rb_str_hash_m(VALUE str)
1855 {
1856     int hval = rb_str_hash(str);
1857     return INT2FIX(hval);
1858 }
1859
/* Yields the smaller of a and b; each argument may be evaluated twice. */
#define lesser(a,b) (((a)>(b))?(b):(a))
1861
1862 int
1863 rb_str_comparable(VALUE str1, VALUE str2)
1864 {
1865     int idx1, idx2;
1866     int rc1, rc2;
1867
1868     if (RSTRING_LEN(str1) == 0) return Qtrue;
1869     if (RSTRING_LEN(str2) == 0) return Qtrue;
1870     idx1 = ENCODING_GET(str1);
1871     idx2 = ENCODING_GET(str2);
1872     if (idx1 == idx2) return Qtrue;
1873     rc1 = rb_enc_str_coderange(str1);
1874     rc2 = rb_enc_str_coderange(str2);
1875     if (rc1 == ENC_CODERANGE_7BIT) {
1876         if (rc2 == ENC_CODERANGE_7BIT) return Qtrue;
1877         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
1878             return Qtrue;
1879     }
1880     if (rc2 == ENC_CODERANGE_7BIT) {
1881         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
1882             return Qtrue;
1883     }
1884     return Qfalse;
1885 }
1886
1887 int
1888 rb_str_cmp(VALUE str1, VALUE str2)
1889 {
1890     long len;
1891     int retval;
1892
1893     len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
1894     retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
1895     if (retval == 0) {
1896         if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
1897             if (!rb_enc_compatible(str1, str2)) {
1898                 if (ENCODING_GET(str1) - ENCODING_GET(str2) > 0)
1899                     return 1;
1900                 return -1;
1901             }
1902             return 0;
1903         }
1904         if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
1905         return -1;
1906     }
1907     if (retval > 0) return 1;
1908     return -1;
1909 }
1910
1911
1912 /*
1913  *  call-seq:
1914  *     str == obj   => true or false
1915  *
1916  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
1917  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
1918  *  <code><=></code> <i>obj</i> returns zero.
1919  */
1920
1921 VALUE
1922 rb_str_equal(VALUE str1, VALUE str2)
1923 {
1924     int len;
1925
1926     if (str1 == str2) return Qtrue;
1927     if (TYPE(str2) != T_STRING) {
1928         if (!rb_respond_to(str2, rb_intern("to_str"))) {
1929             return Qfalse;
1930         }
1931         return rb_equal(str2, str1);
1932     }
1933     if (!rb_str_comparable(str1, str2)) return Qfalse;
1934     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1935         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1936         return Qtrue;
1937     }
1938     return Qfalse;
1939 }
1940
1941 /*
1942  * call-seq:
1943  *   str.eql?(other)   => true or false
1944  *
 * Two strings are equal if they have the same length and content.
1946  */
1947
1948 static VALUE
1949 rb_str_eql(VALUE str1, VALUE str2)
1950 {
1951     if (TYPE(str2) != T_STRING || RSTRING_LEN(str1) != RSTRING_LEN(str2))
1952         return Qfalse;
1953
1954     if (!rb_str_comparable(str1, str2)) return Qfalse;
1955     if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2),
1956                lesser(RSTRING_LEN(str1), RSTRING_LEN(str2))) == 0)
1957         return Qtrue;
1958
1959     return Qfalse;
1960 }
1961
1962 /*
1963  *  call-seq:
1964  *     str <=> other_str   => -1, 0, +1
1965  *
 *  Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
 *  <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
1968  *  <i>str</i>. If the strings are of different lengths, and the strings are
1969  *  equal when compared up to the shortest length, then the longer string is
1970  *  considered greater than the shorter one. In older versions of Ruby, setting
1971  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
1972  *  in favor of using <code>String#casecmp</code>.
1973  *
1974  *  <code><=></code> is the basis for the methods <code><</code>,
1975  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
1976  *  included from module <code>Comparable</code>.  The method
1977  *  <code>String#==</code> does not use <code>Comparable#==</code>.
1978  *
1979  *     "abcdef" <=> "abcde"     #=> 1
1980  *     "abcdef" <=> "abcdef"    #=> 0
1981  *     "abcdef" <=> "abcdefg"   #=> -1
1982  *     "abcdef" <=> "ABCDEF"    #=> 1
1983  */
1984
1985 static VALUE
1986 rb_str_cmp_m(VALUE str1, VALUE str2)
1987 {
1988     long result;
1989
1990     if (TYPE(str2) != T_STRING) {
1991         if (!rb_respond_to(str2, rb_intern("to_str"))) {
1992             return Qnil;
1993         }
1994         else if (!rb_respond_to(str2, rb_intern("<=>"))) {
1995             return Qnil;
1996         }
1997         else {
1998             VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
1999
2000             if (NIL_P(tmp)) return Qnil;
2001             if (!FIXNUM_P(tmp)) {
2002                 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
2003             }
2004             result = -FIX2LONG(tmp);
2005         }
2006     }
2007     else {
2008         result = rb_str_cmp(str1, str2);
2009     }
2010     return LONG2NUM(result);
2011 }
2012
2013 /*
2014  *  call-seq:
2015  *     str.casecmp(other_str)   => -1, 0, +1
2016  *
2017  *  Case-insensitive version of <code>String#<=></code>.
2018  *
2019  *     "abcdef".casecmp("abcde")     #=> 1
2020  *     "aBcDeF".casecmp("abcdef")    #=> 0
2021  *     "abcdef".casecmp("abcdefg")   #=> -1
2022  *     "abcdef".casecmp("ABCDEF")    #=> 0
2023  */
2024
2025 static VALUE
2026 rb_str_casecmp(VALUE str1, VALUE str2)
2027 {
2028     long len;
2029     rb_encoding *enc;
2030     char *p1, *p1end, *p2, *p2end;
2031
2032     StringValue(str2);
2033     enc = rb_enc_compatible(str1, str2);
2034     if (!enc) {
2035         return Qnil;
2036     }
2037
2038     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2039     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2040     while (p1 < p1end && p2 < p2end) {
2041         int c1 = rb_enc_codepoint(p1, p1end, enc);
2042         int c2 = rb_enc_codepoint(p2, p2end, enc);
2043
2044         if (c1 != c2) {
2045             c1 = rb_enc_toupper(c1, enc);
2046             c2 = rb_enc_toupper(c2, enc);
2047             if (c1 > c2) return INT2FIX(1);
2048             if (c1 < c2) return INT2FIX(-1);
2049         }
2050         len = rb_enc_codelen(c1, enc);
2051         p1 += len;
2052         p2 += len;
2053     }
2054     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2055     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2056     return INT2FIX(-1);
2057 }
2058
2059 static long
2060 rb_str_index(VALUE str, VALUE sub, long offset)
2061 {
2062     long pos;
2063     char *s, *sptr;
2064     long len, slen;
2065     rb_encoding *enc;
2066
2067     enc = rb_enc_check(str, sub);
2068     if (is_broken_string(sub)) {
2069         return -1;
2070     }
2071     len = str_strlen(str, enc);
2072     slen = str_strlen(sub, enc);
2073     if (offset < 0) {
2074         offset += len;
2075         if (offset < 0) return -1;
2076     }
2077     if (len - offset < slen) return -1;
2078     s = RSTRING_PTR(str);
2079     if (offset) {
2080         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2081         s += offset;
2082     }
2083     if (slen == 0) return offset;
2084     /* need proceed one character at a time */
2085     sptr = RSTRING_PTR(sub);
2086     slen = RSTRING_LEN(sub);
2087     len = RSTRING_LEN(str) - offset;
2088     for (;;) {
2089         char *t;
2090         pos = rb_memsearch(sptr, slen, s, len, enc);
2091         if (pos < 0) return pos;
2092         t = rb_enc_right_char_head(s, s+pos, enc);
2093         if (t == s + pos) break;
2094         if ((len -= t - s) <= 0) return -1;
2095         offset += t - s;
2096         s = t;
2097     }
2098     return pos + offset;
2099 }
2100
2101
2102 /*
2103  *  call-seq:
2104  *     str.index(substring [, offset])   => fixnum or nil
2105  *     str.index(fixnum [, offset])      => fixnum or nil
2106  *     str.index(regexp [, offset])      => fixnum or nil
2107  *
2108  *  Returns the index of the first occurrence of the given <i>substring</i>,
2109  *  character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns
2110  *  <code>nil</code> if not found. If the second parameter is present, it
2111  *  specifies the position in the string to begin the search.
2112  *
2113  *     "hello".index('e')             #=> 1
2114  *     "hello".index('lo')            #=> 3
2115  *     "hello".index('a')             #=> nil
2116  *     "hello".index(?e)              #=> 1
2117  *     "hello".index(101)             #=> 1
2118  *     "hello".index(/[aeiou]/, -3)   #=> 4
2119  */
2120
2121 static VALUE
2122 rb_str_index_m(int argc, VALUE *argv, VALUE str)
2123 {
2124     VALUE sub;
2125     VALUE initpos;
2126     long pos;
2127
2128     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2129         pos = NUM2LONG(initpos);
2130     }
2131     else {
2132         pos = 0;
2133     }
2134     if (pos < 0) {
2135         pos += str_strlen(str, STR_ENC_GET(str));
2136         if (pos < 0) {
2137             if (TYPE(sub) == T_REGEXP) {
2138                 rb_backref_set(Qnil);
2139             }
2140             return Qnil;
2141         }
2142     }
2143
2144     switch (TYPE(sub)) {
2145       case T_REGEXP:
2146         pos = rb_reg_adjust_startpos(sub, str, pos, 0);
2147         pos = rb_reg_search(sub, str, pos, 0);
2148         pos = rb_str_sublen(str, pos);
2149         break;
2150
2151       default: {
2152         VALUE tmp;
2153
2154         tmp = rb_check_string_type(sub);
2155         if (NIL_P(tmp)) {
2156             rb_raise(rb_eTypeError, "type mismatch: %s given",
2157                      rb_obj_classname(sub));
2158         }
2159         sub = tmp;
2160       }
2161         /* fall through */
2162       case T_STRING:
2163         pos = rb_str_index(str, sub, pos);
2164         pos = rb_str_sublen(str, pos);
2165         break;
2166     }
2167
2168     if (pos == -1) return Qnil;
2169     return LONG2NUM(pos);
2170 }
2171
2172 static long
2173 rb_str_rindex(VALUE str, VALUE sub, long pos)
2174 {
2175     long len, slen;
2176     char *s, *sbeg, *e, *t;
2177     rb_encoding *enc;
2178     int singlebyte = single_byte_optimizable(str);
2179
2180     enc = rb_enc_check(str, sub);
2181     if (is_broken_string(sub)) {
2182         return -1;
2183     }
2184     len = str_strlen(str, enc);
2185     slen = str_strlen(sub, enc);
2186     /* substring longer than string */
2187     if (len < slen) return -1;
2188     if (len - pos < slen) {
2189         pos = len - slen;
2190     }
2191     if (len == 0) {
2192         return pos;
2193     }
2194     sbeg = RSTRING_PTR(str);
2195     e = RSTRING_END(str);
2196     t = RSTRING_PTR(sub);
2197     slen = RSTRING_LEN(sub);
2198     for (;;) {
2199         s = str_nth(sbeg, e, pos, enc, singlebyte);
2200         if (!s) return -1;
2201         if (memcmp(s, t, slen) == 0) {
2202             return pos;
2203         }
2204         if (pos == 0) break;
2205         pos--;
2206     }
2207     return -1;
2208 }
2209
2210
2211 /*
2212  *  call-seq:
2213  *     str.rindex(substring [, fixnum])   => fixnum or nil
2214  *     str.rindex(fixnum [, fixnum])   => fixnum or nil
2215  *     str.rindex(regexp [, fixnum])   => fixnum or nil
2216  *
2217  *  Returns the index of the last occurrence of the given <i>substring</i>,
2218  *  character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns
2219  *  <code>nil</code> if not found. If the second parameter is present, it
2220  *  specifies the position in the string to end the search---characters beyond
2221  *  this point will not be considered.
2222  *
2223  *     "hello".rindex('e')             #=> 1
2224  *     "hello".rindex('l')             #=> 3
2225  *     "hello".rindex('a')             #=> nil
2226  *     "hello".rindex(?e)              #=> 1
2227  *     "hello".rindex(101)             #=> 1
2228  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
2229  */
2230
2231 static VALUE
2232 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
2233 {
2234     VALUE sub;
2235     VALUE vpos;
2236     rb_encoding *enc = STR_ENC_GET(str);
2237     long pos, len = str_strlen(str, enc);
2238
2239     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2240         pos = NUM2LONG(vpos);
2241         if (pos < 0) {
2242             pos += len;
2243             if (pos < 0) {
2244                 if (TYPE(sub) == T_REGEXP) {
2245                     rb_backref_set(Qnil);
2246                 }
2247                 return Qnil;
2248             }
2249         }
2250         if (pos > len) pos = len;
2251     }
2252     else {
2253         pos = len;
2254     }
2255
2256     switch (TYPE(sub)) {
2257       case T_REGEXP:
2258         /* enc = rb_get_check(str, sub); */
2259         if (RREGEXP(sub)->len) {
2260             pos = rb_reg_adjust_startpos(sub, str, pos, 1);
2261             pos = rb_reg_search(sub, str, pos, 1);
2262             pos = rb_str_sublen(str, pos);
2263         }
2264         if (pos >= 0) return LONG2NUM(pos);
2265         break;
2266
2267       default: {
2268         VALUE tmp;
2269
2270         tmp = rb_check_string_type(sub);
2271         if (NIL_P(tmp)) {
2272             rb_raise(rb_eTypeError, "type mismatch: %s given",
2273                      rb_obj_classname(sub));
2274         }
2275         sub = tmp;
2276       }
2277         /* fall through */
2278       case T_STRING:
2279         pos = rb_str_rindex(str, sub, pos);
2280         if (pos >= 0) return LONG2NUM(pos);
2281         break;
2282     }
2283     return Qnil;
2284 }
2285
2286 /*
2287  *  call-seq:
2288  *     str =~ obj   => fixnum or nil
2289  *
2290  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
2291  *  against <i>str</i>,and returns the position the match starts, or
2292  *  <code>nil</code> if there is no match. Otherwise, invokes
2293  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2294  *  <code>=~</code> in <code>Object</code> returns <code>false</code>.
2295  *
2296  *     "cat o' 9 tails" =~ /\d/   #=> 7
2297  *     "cat o' 9 tails" =~ 9      #=> nil
2298  */
2299
2300 static VALUE
2301 rb_str_match(VALUE x, VALUE y)
2302 {
2303     switch (TYPE(y)) {
2304       case T_STRING:
2305         rb_raise(rb_eTypeError, "type mismatch: String given");
2306
2307       case T_REGEXP:
2308         return rb_reg_match(y, x);
2309
2310       default:
2311         return rb_funcall(y, rb_intern("=~"), 1, x);
2312     }
2313 }
2314
2315
2316 static VALUE get_pat(VALUE, int);
2317
2318
2319 /*
2320  *  call-seq:
2321  *     str.match(pattern [, pos])   => matchdata or nil
2322  *
2323  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2324  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
2325  *  parameter is present, it specifies the position in the string to begin the
2326  *  search.
2327  *
2328  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
2329  *     'hello'.match('(.)\1')[0]   #=> "ll"
2330  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
2331  *     'hello'.match('xx')         #=> nil
2332  *
2333  *  If a block is given, invoke the block with MatchData if match succeed, so
2334  *  that you can write
2335  *
2336  *     str.match(pat) {|m| ...}
2337  *
2338  *  instead of
2339  *
2340  *     if m = str.match(pat)
2341  *       ...
2342  *     end
2343  *
2344  *  The return value is a value from block execution in this case.
2345  */
2346
2347 static VALUE
2348 rb_str_match_m(int argc, VALUE *argv, VALUE str)
2349 {
2350     VALUE re, result;
2351     if (argc < 1)
2352         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2353     re = argv[0];
2354     argv[0] = str;
2355     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2356     if (!NIL_P(result) && rb_block_given_p()) {
2357         return rb_yield(result);
2358     }
2359     return result;
2360 }
2361
/* Outcome of stepping a single character to its neighbor (succ/pred). */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable neighbor for this character */
    NEIGHBOR_FOUND    = 1,  /* the character was replaced by its neighbor */
    NEIGHBOR_WRAPPED  = 2   /* stepping ran off the end of the range */
};
2367
2368 static enum neighbor_char
2369 enc_succ_char(char *p, int len, rb_encoding *enc)
2370 {
2371     int i, l;
2372     while (1) {
2373         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
2374             p[i] = '\0';
2375         if (i < 0)
2376             return NEIGHBOR_WRAPPED;
2377         ++((unsigned char*)p)[i];
2378         l = rb_enc_precise_mbclen(p, p+len, enc);
2379         if (MBCLEN_CHARFOUND_P(l)) {
2380             l = MBCLEN_CHARFOUND_LEN(l);
2381             if (l == len) {
2382                 return NEIGHBOR_FOUND;
2383             }
2384             else {
2385                 memset(p+l, 0xff, len-l);
2386             }
2387         }
2388         if (MBCLEN_INVALID_P(l) && i < len-1) {
2389             int len2, l2;
2390             for (len2 = len-1; 0 < len2; len2--) {
2391                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2392                 if (!MBCLEN_INVALID_P(l2))
2393                     break;
2394             }
2395             memset(p+len2+1, 0xff, len-(len2+1));
2396         }
2397     }
2398 }
2399
2400 static enum neighbor_char
2401 enc_pred_char(char *p, int len, rb_encoding *enc)
2402 {
2403     int i, l;
2404     while (1) {
2405         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
2406             p[i] = '\xff';
2407         if (i < 0)
2408             return NEIGHBOR_WRAPPED;
2409         --((unsigned char*)p)[i];
2410         l = rb_enc_precise_mbclen(p, p+len, enc);
2411         if (MBCLEN_CHARFOUND_P(l)) {
2412             l = MBCLEN_CHARFOUND_LEN(l);
2413             if (l == len) {
2414                 return NEIGHBOR_FOUND;
2415             }
2416             else {
2417                 memset(p+l, 0, len-l);
2418             }
2419         }
2420         if (MBCLEN_INVALID_P(l) && i < len-1) {
2421             int len2, l2;
2422             for (len2 = len-1; 0 < len2; len2--) {
2423                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2424                 if (!MBCLEN_INVALID_P(l2))
2425                     break;
2426             }
2427             memset(p+len2+1, 0, len-(len2+1));
2428         }
2429     }
2430 }
2431
2432 /*
2433   overwrite +p+ by succeeding letter in +enc+ and returns
2434   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2435   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2436   assuming each ranges are successive, and mbclen
2437   never change in each ranges.
2438   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2439   character.
2440  */
2441 static enum neighbor_char
2442 enc_succ_alnum_char(char *p, int len, rb_encoding *enc, char *carry)
2443 {
2444     enum neighbor_char ret;
2445     int c;
2446     int ctype;
2447     int range;
2448     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2449
2450     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2451     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2452         ctype = ONIGENC_CTYPE_DIGIT;
2453     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2454         ctype = ONIGENC_CTYPE_ALPHA;
2455     else
2456         return NEIGHBOR_NOT_CHAR;
2457
2458     MEMCPY(save, p, char, len);
2459     ret = enc_succ_char(p, len, enc);
2460     if (ret == NEIGHBOR_FOUND) {
2461         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2462         if (rb_enc_isctype(c, ctype, enc))
2463             return NEIGHBOR_FOUND;
2464     }
2465     MEMCPY(p, save, char, len);
2466     range = 1;
2467     while (1) {
2468         MEMCPY(save, p, char, len);
2469         ret = enc_pred_char(p, len, enc);
2470         if (ret == NEIGHBOR_FOUND) {
2471             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2472             if (!rb_enc_isctype(c, ctype, enc)) {
2473                 MEMCPY(p, save, char, len);
2474                 break;
2475             }
2476         }
2477         else {
2478             MEMCPY(p, save, char, len);
2479             break;
2480         }
2481         range++;
2482     }
2483     if (range == 1) {
2484         return NEIGHBOR_NOT_CHAR;
2485     }
2486
2487     if (ctype != ONIGENC_CTYPE_DIGIT) {
2488         MEMCPY(carry, p, char, len);
2489         return NEIGHBOR_WRAPPED;
2490     }
2491
2492     MEMCPY(carry, p, char, len);
2493     enc_succ_char(carry, len, enc);
2494     return NEIGHBOR_WRAPPED;
2495 }
2496
2497
2498 /*
2499  *  call-seq:
2500  *     str.succ   => new_str
2501  *     str.next   => new_str
2502  *
2503  *  Returns the successor to <i>str</i>. The successor is calculated by
2504  *  incrementing characters starting from the rightmost alphanumeric (or
2505  *  the rightmost character if there are no alphanumerics) in the
2506  *  string. Incrementing a digit always results in another digit, and
2507  *  incrementing a letter results in another letter of the same case.
2508  *  Incrementing nonalphanumerics uses the underlying character set's
2509  *  collating sequence.
2510  *
2511  *  If the increment generates a ``carry,'' the character to the left of
2512  *  it is incremented. This process repeats until there is no carry,
2513  *  adding an additional character if necessary.
2514  *
2515  *     "abcd".succ        #=> "abce"
2516  *     "THX1138".succ     #=> "THX1139"
2517  *     "<<koala>>".succ   #=> "<<koalb>>"
2518  *     "1999zzz".succ     #=> "2000aaa"
2519  *     "ZZZ9999".succ     #=> "AAAA0000"
2520  *     "***".succ         #=> "**+"
2521  */
2522
2523 VALUE
2524 rb_str_succ(VALUE orig)
2525 {
2526     rb_encoding *enc;
2527     VALUE str;
2528     char *sbeg, *s, *e;
2529     int c = -1;
2530     long l;
2531     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
2532     int carry_pos = 0, carry_len = 1;
2533
2534     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
2535     rb_enc_cr_str_copy_for_substr(str, orig);
2536     OBJ_INFECT(str, orig);
2537     if (RSTRING_LEN(str) == 0) return str;
2538
2539     enc = STR_ENC_GET(orig);
2540     sbeg = RSTRING_PTR(str);
2541     s = e = sbeg + RSTRING_LEN(str);
2542
2543     while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
2544         enum neighbor_char neighbor;
2545         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2546         neighbor = enc_succ_alnum_char(s, l, enc, carry);
2547         if (neighbor == NEIGHBOR_NOT_CHAR)
2548             continue;
2549         if (neighbor == NEIGHBOR_FOUND)
2550             return str;
2551         c = 1;
2552         carry_pos = s - sbeg;
2553         carry_len = l;
2554     }
2555     if (c == -1) {              /* str contains no alnum */
2556         s = e;
2557         while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
2558             enum neighbor_char neighbor;
2559             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2560             neighbor = enc_succ_char(s, l, enc);
2561             if (neighbor == NEIGHBOR_FOUND)
2562                 return str;
2563             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
2564                 /* wrapped to \0...\0.  search next valid char. */
2565                 enc_succ_char(s, l, enc);
2566             }
2567             if (!rb_enc_asciicompat(enc)) {
2568                 MEMCPY(carry, s, char, l);
2569                 carry_len = l;
2570             }
2571             carry_pos = s - sbeg;
2572         }
2573     }
2574     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
2575     s = RSTRING_PTR(str) + carry_pos;
2576     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
2577     memmove(s, carry, carry_len);
2578     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
2579     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2580     rb_enc_str_coderange(str);
2581     return str;
2582 }
2583
2584
2585 /*
2586  *  call-seq:
2587  *     str.succ!   => str
2588  *     str.next!   => str
2589  *
2590  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
2591  *  place.
2592  */
2593
2594 static VALUE
2595 rb_str_succ_bang(VALUE str)
2596 {
2597     rb_str_shared_replace(str, rb_str_succ(str));
2598
2599     return str;
2600 }
2601
2602
2603 /*
2604  *  call-seq:
2605  *     str.upto(other_str, exclusive=false) {|s| block }   => str
2606  *
2607  *  Iterates through successive values, starting at <i>str</i> and
2608  *  ending at <i>other_str</i> inclusive, passing each value in turn to
2609  *  the block. The <code>String#succ</code> method is used to generate
2610  *  each value.  If optional second argument exclusive is omitted or is <code>false</code>,
2611  *  the last value will be included; otherwise it will be excluded.
2612  *
2613  *     "a8".upto("b6") {|s| print s, ' ' }
2614  *     for s in "a8".."b6"
2615  *       print s, ' '
2616  *     end
2617  *
2618  *  <em>produces:</em>
2619  *
2620  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2621  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2622  */
2623
2624 static VALUE
2625 rb_str_upto(int argc, VALUE *argv, VALUE beg)
2626 {
2627     VALUE end, exclusive;
2628     VALUE current, after_end;
2629     ID succ;
2630     int n, excl;
2631     rb_encoding *enc;
2632
2633     rb_scan_args(argc, argv, "11", &end, &exclusive);
2634     excl = RTEST(exclusive);
2635     CONST_ID(succ, "succ");
2636     StringValue(end);
2637     enc = rb_enc_check(beg, end);
2638     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 &&
2639         is_ascii_string(beg) && is_ascii_string(end)) {
2640         char c = RSTRING_PTR(beg)[0];
2641         char e = RSTRING_PTR(end)[0];
2642
2643         if (c > e || (excl && c == e)) return beg;
2644         for (;;) {
2645             rb_yield(rb_enc_str_new(&c, 1, enc));
2646             if (!excl && c == e) break;
2647             c++;
2648             if (excl && c == e) break;
2649         }
2650         return beg;
2651     }
2652     n = rb_str_cmp(beg, end);
2653     if (n > 0 || (excl && n == 0)) return beg;
2654
2655     after_end = rb_funcall(end, succ, 0, 0);
2656     current = beg;
2657     while (!rb_str_equal(current, after_end)) {
2658         rb_yield(current);
2659         if (!excl && rb_str_equal(current, end)) break;
2660         current = rb_funcall(current, succ, 0, 0);
2661         StringValue(current);
2662         if (excl && rb_str_equal(current, end)) break;
2663         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
2664             break;
2665     }
2666
2667     return beg;
2668 }
2669
2670 static VALUE
2671 rb_str_subpat(VALUE str, VALUE re, int nth)
2672 {
2673     if (rb_reg_search(re, str, 0, 0) >= 0) {
2674         return rb_reg_nth_match(nth, rb_backref_get());
2675     }
2676     return Qnil;
2677 }
2678
2679 static VALUE
2680 rb_str_aref(VALUE str, VALUE indx)
2681 {
2682     long idx;
2683
2684     switch (TYPE(indx)) {
2685       case T_FIXNUM:
2686         idx = FIX2LONG(indx);
2687
2688       num_index:
2689         str = rb_str_substr(str, idx, 1);
2690         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
2691         return str;
2692
2693       case T_REGEXP:
2694         return rb_str_subpat(str, indx, 0);
2695
2696       case T_STRING:
2697         if (rb_str_index(str, indx, 0) != -1)
2698             return rb_str_dup(indx);
2699         return Qnil;
2700
2701       default:
2702         /* check if indx is Range */
2703         {
2704             long beg, len;
2705             VALUE tmp;
2706
2707             len = str_strlen(str, STR_ENC_GET(str));
2708             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
2709               case Qfalse:
2710                 break;
2711               case Qnil:
2712                 return Qnil;
2713               default:
2714                 tmp = rb_str_substr(str, beg, len);
2715                 return tmp;
2716             }
2717         }
2718         idx = NUM2LONG(indx);
2719         goto num_index;
2720     }
2721     return Qnil;                /* not reached */
2722 }
2723
2724
2725 /*
2726  *  call-seq:
2727  *     str[fixnum]                 => new_str or nil
2728  *     str[fixnum, fixnum]         => new_str or nil
2729  *     str[range]                  => new_str or nil
2730  *     str[regexp]                 => new_str or nil
2731  *     str[regexp, fixnum]         => new_str or nil
2732  *     str[other_str]              => new_str or nil
2733  *     str.slice(fixnum)           => new_str or nil
2734  *     str.slice(fixnum, fixnum)   => new_str or nil
2735  *     str.slice(range)            => new_str or nil
2736  *     str.slice(regexp)           => new_str or nil
2737  *     str.slice(regexp, fixnum)   => new_str or nil
2738  *     str.slice(other_str)        => new_str or nil
2739  *
2740  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
2741  *  substring of one character at that position. If passed two <code>Fixnum</code>
2742  *  objects, returns a substring starting at the offset given by the first, and
2743  *  a length given by the second. If given a range, a substring containing
2744  *  characters at offsets given by the range is returned. In all three cases, if
2745  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
2746  *  <code>nil</code> if the initial offset falls outside the string, the length
2747  *  is negative, or the beginning of the range is greater than the end.
2748  *
2749  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
2750  *  returned. If a numeric parameter follows the regular expression, that
2751  *  component of the <code>MatchData</code> is returned instead. If a
2752  *  <code>String</code> is given, that string is returned if it occurs in
2753  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
2754  *  match.
2755  *
2756  *     a = "hello there"
2757  *     a[1]                   #=> "e"
2758  *     a[1,3]                 #=> "ell"
2759  *     a[1..3]                #=> "ell"
2760  *     a[-3,2]                #=> "er"
2761  *     a[-4..-2]              #=> "her"
2762  *     a[12..-1]              #=> nil
2763  *     a[-2..-4]              #=> ""
2764  *     a[/[aeiou](.)\1/]      #=> "ell"
2765  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
2766  *     a[/[aeiou](.)\1/, 1]   #=> "l"
2767  *     a[/[aeiou](.)\1/, 2]   #=> nil
2768  *     a["lo"]                #=> "lo"
2769  *     a["bye"]               #=> nil
2770  */
2771
2772 static VALUE
2773 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
2774 {
2775     if (argc == 2) {
2776         if (TYPE(argv[0]) == T_REGEXP) {
2777             return rb_str_subpat(str, argv[0], NUM2INT(argv[1]));
2778         }
2779         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
2780     }
2781     if (argc != 1) {
2782         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2783     }
2784     return rb_str_aref(str, argv[0]);
2785 }
2786
2787 static void
2788 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
2789 {
2790     rb_str_modify(str);
2791     if (len < RSTRING_LEN(val)) {
2792         /* expand string */
2793         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
2794     }
2795
2796     if (RSTRING_LEN(val) != len) {
2797         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
2798                 RSTRING_PTR(str) + beg + len,
2799                 RSTRING_LEN(str) - (beg + len));
2800     }
2801     if (RSTRING_LEN(val) < beg && len < 0) {
2802         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
2803     }
2804     if (RSTRING_LEN(val) > 0) {
2805         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
2806     }
2807     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
2808     if (RSTRING_PTR(str)) {
2809         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2810     }
2811     OBJ_INFECT(str, val);
2812 }
2813
2814 static void
2815 rb_str_splice(VALUE str, long beg, long len, VALUE val)
2816 {
2817     long slen;
2818     char *p, *e;
2819     rb_encoding *enc;
2820     int singlebyte = single_byte_optimizable(str);
2821
2822     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
2823
2824     StringValue(val);
2825     rb_str_modify(str);
2826     enc = rb_enc_check(str, val);
2827     slen = str_strlen(str, enc);
2828
2829     if (slen < beg) {
2830       out_of_range:
2831         rb_raise(rb_eIndexError, "index %ld out of string", beg);
2832     }
2833     if (beg < 0) {
2834         if (-beg > slen) {
2835             goto out_of_range;
2836         }
2837         beg += slen;
2838     }
2839     if (slen < len || slen < beg + len) {
2840         len = slen - beg;
2841     }
2842     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
2843     if (!p) p = RSTRING_END(str);
2844     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
2845     if (!e) e = RSTRING_END(str);
2846     /* error check */
2847     beg = p - RSTRING_PTR(str); /* physical position */
2848     len = e - p;                /* physical length */
2849     rb_str_splice_0(str, beg, len, val);
2850     rb_enc_associate(str, enc);
2851 }
2852
2853 void
2854 rb_str_update(VALUE str, long beg, long len, VALUE val)
2855 {
2856     rb_str_splice(str, beg, len, val);
2857 }
2858
2859 static void
2860 rb_str_subpat_set(VALUE str, VALUE re, int nth, VALUE val)
2861 {
2862     VALUE match;
2863     long start, end, len;
2864     rb_encoding *enc;
2865     struct re_registers *regs;
2866
2867     if (rb_reg_search(re, str, 0, 0) < 0) {
2868         rb_raise(rb_eIndexError, "regexp not matched");
2869     }
2870     match = rb_backref_get();
2871     regs = RMATCH_REGS(match);
2872     if (nth >= regs->num_regs) {
2873       out_of_range:
2874         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
2875     }
2876     if (nth < 0) {
2877         if (-nth >= regs->num_regs) {
2878             goto out_of_range;
2879         }
2880         nth += regs->num_regs;
2881     }
2882
2883     start = BEG(nth);
2884     if (start == -1) {
2885         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
2886     }
2887     end = END(nth);
2888     len = end - start;
2889     StringValue(val);
2890     enc = rb_enc_check(str, val);
2891     rb_str_splice_0(str, start, len, val);
2892     rb_enc_associate(str, enc);
2893 }
2894
2895 static VALUE
2896 rb_str_aset(VALUE str, VALUE indx, VALUE val)
2897 {
2898     long idx, beg;
2899
2900     switch (TYPE(indx)) {
2901       case T_FIXNUM:
2902         idx = FIX2LONG(indx);
2903       num_index:
2904         rb_str_splice(str, idx, 1, val);
2905         return val;
2906
2907       case T_REGEXP:
2908         rb_str_subpat_set(str, indx, 0, val);
2909         return val;
2910
2911       case T_STRING:
2912         beg = rb_str_index(str, indx, 0);
2913         if (beg < 0) {
2914             rb_raise(rb_eIndexError, "string not matched");
2915         }
2916         beg = rb_str_sublen(str, beg);
2917         rb_str_splice(str, beg, str_strlen(indx, 0), val);
2918         return val;
2919
2920       default:
2921         /* check if indx is Range */
2922         {
2923             long beg, len;
2924             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
2925                 rb_str_splice(str, beg, len, val);
2926                 return val;
2927             }
2928         }
2929         idx = NUM2LONG(indx);
2930         goto num_index;
2931     }
2932 }
2933
2934 /*
2935  *  call-seq:
2936  *     str[fixnum] = new_str
2937  *     str[fixnum, fixnum] = new_str
2938  *     str[range] = aString
2939  *     str[regexp] = new_str
2940  *     str[regexp, fixnum] = new_str
2941  *     str[other_str] = new_str
2942  *
2943  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
2944  *  portion of the string affected is determined using the same criteria as
2945  *  <code>String#[]</code>. If the replacement string is not the same length as
2946  *  the text it is replacing, the string will be adjusted accordingly. If the
2947  *  regular expression or string used as the index doesn't match a position
2948  *  in the string, <code>IndexError</code> is raised. If the regular expression
2949  *  form is used, the optional second <code>Fixnum</code> allows you to specify
2950  *  which portion of the match to replace (effectively using the
2951  *  <code>MatchData</code> indexing rules). The forms that take a
2952  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
2953  *  out of range; the <code>Range</code> form will raise a
2954  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
2955  *  forms will raise an <code>IndexError</code> when there is no match.
2956  */
2957
2958 static VALUE
2959 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
2960 {
2961     if (argc == 3) {
2962         if (TYPE(argv[0]) == T_REGEXP) {
2963             rb_str_subpat_set(str, argv[0], NUM2INT(argv[1]), argv[2]);
2964         }
2965         else {
2966             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
2967         }
2968         return argv[2];
2969     }
2970     if (argc != 2) {
2971         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
2972     }
2973     return rb_str_aset(str, argv[0], argv[1]);
2974 }
2975
2976 /*
2977  *  call-seq:
2978  *     str.insert(index, other_str)   => str
2979  *
2980  *  Inserts <i>other_str</i> before the character at the given
2981  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
2982  *  end of the string, and insert <em>after</em> the given character.
2983  *  The intent is insert <i>aString</i> so that it starts at the given
2984  *  <i>index</i>.
2985  *
2986  *     "abcd".insert(0, 'X')    #=> "Xabcd"
2987  *     "abcd".insert(3, 'X')    #=> "abcXd"
2988  *     "abcd".insert(4, 'X')    #=> "abcdX"
2989  *     "abcd".insert(-3, 'X')   #=> "abXcd"
2990  *     "abcd".insert(-1, 'X')   #=> "abcdX"
2991  */
2992
2993 static VALUE
2994 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
2995 {
2996     long pos = NUM2LONG(idx);
2997
2998     if (pos == -1) {
2999         return rb_str_append(str, str2);
3000     }
3001     else if (pos < 0) {
3002         pos++;
3003     }
3004     rb_str_splice(str, pos, 0, str2);
3005     return str;
3006 }
3007
3008
3009 /*
3010  *  call-seq:
3011  *     str.slice!(fixnum)           => new_str or nil
3012  *     str.slice!(fixnum, fixnum)   => new_str or nil
3013  *     str.slice!(range)            => new_str or nil
3014  *     str.slice!(regexp)           => new_str or nil
3015  *     str.slice!(other_str)        => new_str or nil
3016  *
3017  *  Deletes the specified portion from <i>str</i>, and returns the portion
3018  *  deleted.
3019  *
3020  *     string = "this is a string"
3021  *     string.slice!(2)        #=> "i"
3022  *     string.slice!(3..6)     #=> " is "
3023  *     string.slice!(/s.*t/)   #=> "sa st"
3024  *     string.slice!("r")      #=> "r"
3025  *     string                  #=> "thing"
3026  */
3027
3028 static VALUE
3029 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
3030 {
3031     VALUE result;
3032     VALUE buf[3];
3033     int i;
3034
3035     if (argc < 1 || 2 < argc) {
3036         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
3037     }
3038     for (i=0; i<argc; i++) {
3039         buf[i] = argv[i];
3040     }
3041     rb_str_modify(str);
3042     buf[i] = rb_str_new(0,0);
3043     result = rb_str_aref_m(argc, buf, str);
3044     if (!NIL_P(result)) {
3045         rb_str_aset_m(argc+1, buf, str);
3046     }
3047     return result;
3048 }
3049
3050 static VALUE
3051 get_pat(VALUE pat, int quote)
3052 {
3053     VALUE val;
3054
3055     switch (TYPE(pat)) {
3056       case T_REGEXP:
3057         return pat;
3058
3059       case T_STRING:
3060         break;
3061
3062       default:
3063         val = rb_check_string_type(pat);
3064         if (NIL_P(val)) {
3065             Check_Type(pat, T_REGEXP);
3066         }
3067         pat = val;
3068     }
3069
3070     if (quote) {
3071         pat = rb_reg_quote(pat);
3072     }
3073
3074     return rb_reg_regcomp(pat);
3075 }
3076
3077
3078 /*
3079  *  call-seq:
3080  *     str.sub!(pattern, replacement)          => str or nil
3081  *     str.sub!(pattern) {|match| block }      => str or nil
3082  *
3083  *  Performs the substitutions of <code>String#sub</code> in place,
3084  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
3085  *  performed.
3086  */
3087
static VALUE
rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE pat, repl, match, hash = Qnil;
    struct re_registers *regs;
    int iter = 0;               /* set when the replacement comes from the block */
    int tainted = 0;            /* set when any replacement source is tainted */
    long plen;                  /* byte length of the matched portion */

    /* Accepted forms: (pattern) with a block, or (pattern, replacement). */
    if (argc == 1 && rb_block_given_p()) {
        iter = 1;
    }
    else if (argc == 2) {
        repl = argv[1];
        /* a replacement that converts via to_hash is used as a lookup table;
         * otherwise it has to be string-like */
        hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        if (OBJ_TAINTED(repl)) tainted = 1;
    }
    else {
        rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
    }

    /* quote=1: a string pattern is regexp-quoted before being compiled */
    pat = get_pat(argv[0], 1);
    if (rb_reg_search(pat, str, 0, 0) >= 0) {
        rb_encoding *enc;
        int cr = ENC_CODERANGE(str);    /* code range before modification */

        match = rb_backref_get();
        regs = RMATCH_REGS(match);

        if (iter || !NIL_P(hash)) {
            /* remember the buffer so a change made by the callback can be
             * detected afterwards */
            char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);

            if (iter) {
                rb_match_busy(match);
                repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                /* look the matched text up in the hash */
                repl = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
                repl = rb_obj_as_string(repl);
            }
            /* NOTE(review): presumably these raise if the callback modified
             * or froze str; the helpers are defined outside this view */
            str_mod_check(str, p, len);
            str_frozen_check(str);
            /* re-install the match data captured before the block ran */
            if (iter) rb_backref_set(match);
        }
        else {
            /* plain replacement string: let rb_reg_regsub() build the text
             * from repl and the match registers */
            repl = rb_reg_regsub(repl, str, regs, pat);
        }
        enc = rb_enc_compatible(str, repl);
        if (!enc) {
            /* No common encoding.  This is tolerated only when the parts of
             * str before and after the match both scan as 7-bit; the result
             * then takes the replacement's encoding. */
            rb_encoding *str_enc = STR_ENC_GET(str);
            if (coderange_scan(RSTRING_PTR(str), BEG(0), str_enc) != ENC_CODERANGE_7BIT ||
                coderange_scan(RSTRING_PTR(str)+END(0),
                               RSTRING_LEN(str)-END(0), str_enc) != ENC_CODERANGE_7BIT) {
                rb_raise(rb_eArgError, "character encodings differ: %s and %s",
                         rb_enc_name(str_enc),
                         rb_enc_name(STR_ENC_GET(repl)));
            }
            enc = STR_ENC_GET(repl);
        }
        rb_str_modify(str);
        rb_enc_associate(str, enc);
        if (OBJ_TAINTED(repl)) tainted = 1;
        /* merge code ranges: when cr lies strictly between UNKNOWN and
         * BROKEN, adopt the replacement's range if it is UNKNOWN or
         * numerically greater */
        if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
            int cr2 = ENC_CODERANGE(repl);
            if (cr2 == ENC_CODERANGE_UNKNOWN || cr2 > cr) cr = cr2;
        }
        plen = END(0) - BEG(0);
        /* grow the buffer first when the replacement is longer than the match */
        if (RSTRING_LEN(repl) > plen) {
            RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
        }
        /* shift the text after the match so the replacement fits exactly */
        if (RSTRING_LEN(repl) != plen) {
            memmove(RSTRING_PTR(str) + BEG(0) + RSTRING_LEN(repl),
                    RSTRING_PTR(str) + BEG(0) + plen,
                    RSTRING_LEN(str) - BEG(0) - plen);
        }
        /* drop the replacement into the gap and fix up length/terminator */
        memcpy(RSTRING_PTR(str) + BEG(0),
               RSTRING_PTR(repl), RSTRING_LEN(repl));
        STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
        RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
        ENC_CODERANGE_SET(str, cr);
        if (tainted) OBJ_TAINT(str);

        return str;
    }
    /* no match: nothing substituted */
    return Qnil;
}
3177
3178
3179 /*
3180  *  call-seq:
3181  *     str.sub(pattern, replacement)         => new_str
3182  *     str.sub(pattern) {|match| block }     => new_str
3183  *
3184  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
3185  *  <i>pattern</i> replaced with either <i>replacement</i> or the value of the
3186  *  block. The <i>pattern</i> will typically be a <code>Regexp</code>; if it is
3187  *  a <code>String</code> then no regular expression metacharacters will be
3188  *  interpreted (that is <code>/\d/</code> will match a digit, but
3189  *  <code>'\d'</code> will match a backslash followed by a 'd').
3190  *
3191  *  If the method call specifies <i>replacement</i>, special variables such as
3192  *  <code>$&</code> will not be useful, as substitution into the string occurs
3193  *  before the pattern match starts. However, the sequences <code>\1</code>,
3194  *  <code>\2</code>, <code>\k<group_name></code>, etc., may be used.
3195  *
3196  *  In the block form, the current match string is passed in as a parameter, and
3197  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3198  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3199  *  returned by the block will be substituted for the match on each call.
3200  *
3201  *  The result inherits any tainting in the original string or any supplied
3202  *  replacement string.
3203  *
3204  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
3205  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
3206  *     "hello".sub(/./) {|s| s[0].ord.to_s + ' ' }  #=> "104 ello"
3207  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
3208  */
3209
3210 static VALUE
3211 rb_str_sub(int argc, VALUE *argv, VALUE str)
3212 {
3213     str = rb_str_dup(str);
3214     rb_str_sub_bang(argc, argv, str);
3215     return str;
3216 }
3217
/*
 * Shared implementation of String#gsub and String#gsub!.
 *
 * argc/argv are the method arguments: (pattern) with a block, or
 * (pattern, replacement) where the replacement is a string or something
 * convertible with to_hash.  When +bang+ is non-zero the receiver's contents
 * are replaced and nil is returned if the pattern never matches; otherwise a
 * new string is returned and the receiver is left alone.
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val, repl, match, dest, hash = Qnil;
    struct re_registers *regs;
    long beg, n;
    long offset, blen, slen, len;
    int iter = 0;               /* set when the replacement comes from the block */
    char *sp, *cp;              /* sp: buffer start on entry; cp: start of text not yet copied */
    int tainted = 0;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        /* NOTE(review): presumably returns an enumerator when no block is
         * given -- the macro is defined outside this view */
        RETURN_ENUMERATOR(str, argc, argv);
        iter = 1;
        break;
      case 2:
        repl = argv[1];
        /* a replacement that converts via to_hash is used as a lookup table;
         * otherwise it has to be string-like */
        hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        if (OBJ_TAINTED(repl)) tainted = 1;
        break;
      default:
        rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
    }

    /* quote=1: a string pattern is regexp-quoted before being compiled */
    pat = get_pat(argv[0], 1);
    offset=0; n=0;
    beg = rb_reg_search(pat, str, 0, 0);
    if (beg < 0) {
        if (bang) return Qnil;  /* no match, no substitution */
        return rb_str_dup(str);
    }

    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* remember the source buffer so a change made by a callback can be
     * detected inside the loop */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);

    /* one iteration per match: append the text before the match, then the
     * replacement, then search again from the end of the match */
    do {
        n++;
        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        if (iter || !NIL_P(hash)) {
            if (iter) {
                rb_match_busy(match);
                val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                /* look the matched text up in the hash */
                val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
                val = rb_obj_as_string(val);
            }
            /* NOTE(review): presumably these raise if the callback modified
             * or froze str; the helpers are defined outside this view */
            str_mod_check(str, sp, slen);
            if (bang) str_frozen_check(str);
            if (val == dest) {  /* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
            /* re-install the match data captured before the block ran */
            if (iter) rb_backref_set(match);
        }
        else {
            val = rb_reg_regsub(repl, str, regs, pat);
        }

        if (OBJ_TAINTED(val)) tainted = 1;

        len = beg - offset;     /* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        offset = END(0);
        if (BEG(0) == END(0)) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= END(0)) break;
            len = rb_enc_mbclen(RSTRING_PTR(str)+END(0), RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+END(0), len, str_enc);
            offset = END(0) + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_reg_search(pat, str, offset, 0);
    } while (beg >= 0);
    /* append whatever follows the last match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* leave the last successful match as the current backref */
    rb_backref_set(match);
    if (bang) {
        rb_str_shared_replace(str, dest);
    }
    else {
        /* the result takes the receiver's class and taint */
        RBASIC(dest)->klass = rb_obj_class(str);
        OBJ_INFECT(dest, str);
        str = dest;
    }

    if (tainted) OBJ_TAINT(str);
    return str;
}
3326
3327
3328 /*
3329  *  call-seq:
3330  *     str.gsub!(pattern, replacement)        => str or nil
3331  *     str.gsub!(pattern) {|match| block }    => str or nil
3332  *
3333  *  Performs the substitutions of <code>String#gsub</code> in place, returning
3334  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
3335  */
3336
3337 static VALUE
3338 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
3339 {
3340     return str_gsub(argc, argv, str, 1);
3341 }
3342
3343
3344 /*
3345  *  call-seq:
3346  *     str.gsub(pattern, replacement)       => new_str
3347  *     str.gsub(pattern) {|match| block }   => new_str
3348  *
3349  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of <i>pattern</i>
3350  *  replaced with either <i>replacement</i> or the value of the block. The
3351  *  <i>pattern</i> will typically be a <code>Regexp</code>; if it is a
3352  *  <code>String</code> then no regular expression metacharacters will be
3353  *  interpreted (that is <code>/\d/</code> will match a digit, but
3354  *  <code>'\d'</code> will match a backslash followed by a 'd').
3355  *
3356  *  If a string is used as the replacement, special variables from the match
3357  *  (such as <code>$&</code> and <code>$1</code>) cannot be substituted into it,
3358  *  as substitution into the string occurs before the pattern match
3359  *  starts. However, the sequences <code>\1</code>, <code>\2</code>,
3360  *  <code>\k<group_name></code>, and so on may be used to interpolate
3361  *  successive groups in the match.
3362  *
3363  *  In the block form, the current match string is passed in as a parameter, and
3364  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3365  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3366  *  returned by the block will be substituted for the match on each call.
3367  *
3368  *  The result inherits any tainting in the original string or any supplied
3369  *  replacement string.
3370  *
3371  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
3372  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
3373  *     "hello".gsub(/./) {|s| s[0].ord.to_s + ' '}   #=> "104 101 108 108 111 "
3374  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
3375  */
3376
3377 static VALUE
3378 rb_str_gsub(int argc, VALUE *argv, VALUE str)
3379 {
3380     return str_gsub(argc, argv, str, 0);
3381 }
3382
3383
3384 /*
3385  *  call-seq:
3386  *     str.replace(other_str)   => str
3387  *
3388  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
3389  *  values in <i>other_str</i>.
3390  *
3391  *     s = "hello"         #=> "hello"
3392  *     s.replace "world"   #=> "world"
3393  */
3394
3395 static VALUE
3396 rb_str_replace(VALUE str, VALUE str2)
3397 {
3398     long len;
3399     if (str == str2) return str;
3400
3401     StringValue(str2);
3402     len = RSTRING_LEN(str2);
3403     if (STR_ASSOC_P(str2)) {
3404         str2 = rb_str_new4(str2);
3405     }
3406     if (STR_SHARED_P(str2)) {
3407         if (str_independent(str) && !STR_EMBED_P(str)) {
3408             xfree(RSTRING_PTR(str));
3409         }
3410         STR_SET_NOEMBED(str);
3411         RSTRING(str)->as.heap.len = len;
3412         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
3413         FL_SET(str, ELTS_SHARED);
3414         FL_UNSET(str, STR_ASSOC);
3415         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
3416     }
3417     else {
3418         rb_str_modify(str);
3419         str_replace_shared(str, rb_str_new4(str2));
3420     }
3421
3422     OBJ_INFECT(str, str2);
3423     rb_enc_cr_str_exact_copy(str, str2);
3424     return str;
3425 }
3426
3427 /*
3428  *  call-seq:
3429  *     string.clear    ->  string
3430  *
3431  *  Makes string empty.
3432  *
3433  *     a = "abcde"
3434  *     a.clear    #=> ""
3435  */
3436
3437 static VALUE
3438 rb_str_clear(VALUE str)
3439 {
3440     /* rb_str_modify() */       /* no need for str_make_independent */
3441     if (str_independent(str) && !STR_EMBED_P(str)) {
3442         xfree(RSTRING_PTR(str));
3443     }
3444     STR_SET_EMBED(str);
3445     STR_SET_EMBED_LEN(str, 0);
3446     RSTRING_PTR(str)[0] = 0;
3447     ENC_CODERANGE_CLEAR(str);
3448     return str;
3449 }
3450
3451 /*
3452  *  call-seq:
3453  *     string.chr    ->  string
3454  *
3455  *  Returns a one-character string at the beginning of the string.
3456  *
3457  *     a = "abcde"
3458  *     a.chr    #=> "a"
3459  */
3460
3461 static VALUE
3462 rb_str_chr(VALUE str)
3463 {
3464     return rb_str_substr(str, 0, 1);
3465 }
3466
3467 /*
3468  *  call-seq:
3469  *     str.getbyte(index)          => 0 .. 255
3470  *
3471  *  returns the <i>index</i>th byte as an integer.
3472  */
3473 static VALUE
3474 rb_str_getbyte(VALUE str, VALUE index)
3475 {
3476     long pos = NUM2LONG(index);
3477
3478     if (pos < 0)
3479         pos += RSTRING_LEN(str);
3480     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
3481         return Qnil;
3482
3483     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
3484 }
3485
3486 /*
3487  *  call-seq:
3488  *     str.setbyte(index, int) => int
3489  *
3490  *  modifies the <i>index</i>th byte as <i>int</i>.
3491  */
3492 static VALUE
3493 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
3494 {
3495     long pos = NUM2LONG(index);
3496     int byte = NUM2INT(value);
3497
3498     rb_str_modify(str);
3499
3500     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
3501         rb_raise(rb_eIndexError, "index %ld out of string", pos);
3502     if (pos < 0)
3503         pos += RSTRING_LEN(str);
3504
3505     RSTRING_PTR(str)[pos] = byte;
3506
3507     return value;
3508 }
3509
3510 /*
3511  *  call-seq:
3512  *     str.reverse   => new_str
3513  *
3514  *  Returns a new string with the characters from <i>str</i> in reverse order.
3515  *
3516  *     "stressed".reverse   #=> "desserts"
3517  */
3518
3519 static VALUE
3520 rb_str_reverse(VALUE str)
3521 {
3522     rb_encoding *enc;
3523     VALUE rev;
3524     char *s, *e, *p;
3525     int single = 1;
3526
3527     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
3528     enc = STR_ENC_GET(str);
3529     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
3530     s = RSTRING_PTR(str); e = RSTRING_END(str);
3531     p = RSTRING_END(rev);
3532
3533     if (RSTRING_LEN(str) > 1) {
3534         if (single_byte_optimizable(str)) {
3535             while (s < e) {
3536                 *--p = *s++;
3537             }
3538         }
3539         else {
3540             while (s < e) {
3541                 int clen = rb_enc_mbclen(s, e, enc);
3542
3543                 if (clen > 1 || (*s & 0x80)) single = 0;
3544                 p -= clen;
3545                 memcpy(p, s, clen);
3546                 s += clen;
3547             }
3548         }
3549     }
3550     STR_SET_LEN(rev, RSTRING_LEN(str));
3551     OBJ_INFECT(rev, str);
3552     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
3553         if (single) {
3554             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
3555         }
3556         else {
3557             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
3558         }
3559     }
3560     rb_enc_cr_str_copy_for_substr(rev, str);
3561
3562     return rev;
3563 }
3564
3565
3566 /*
3567  *  call-seq:
3568  *     str.reverse!   => str
3569  *
3570  *  Reverses <i>str</i> in place.
3571  */
3572
3573 static VALUE
3574 rb_str_reverse_bang(VALUE str)
3575 {
3576     if (RSTRING_LEN(str) > 1) {
3577         if (single_byte_optimizable(str)) {
3578             char *s, *e, c;
3579             int cr = ENC_CODERANGE(str);
3580             int single = 1;
3581
3582             rb_str_modify(str);
3583             s = RSTRING_PTR(str);
3584             e = RSTRING_END(str) - 1;
3585             while (s < e) {
3586                 c = *s;
3587                 if (*s & 0x80) single = 0;
3588                 *s++ = *e;
3589                 *e-- = c;
3590             }
3591             if (cr == ENC_CODERANGE_UNKNOWN && single) {
3592                 cr = ENC_CODERANGE_7BIT;
3593             }
3594             ENC_CODERANGE_SET(str, cr);
3595         }
3596         else {
3597             rb_str_shared_replace(str, rb_str_reverse(str));
3598         }
3599     }
3600     return str;
3601 }
3602
3603
3604 /*
3605  *  call-seq:
 *     str.include? other_str   => true or false
 *
 *  Returns <code>true</code> if <i>str</i> contains the given string
 *  (which may be a single-character string such as <code>?h</code>).
 *  The argument is converted with <code>to_str</code>.
3611  *
3612  *     "hello".include? "lo"   #=> true
3613  *     "hello".include? "ol"   #=> false
3614  *     "hello".include? ?h     #=> true
3615  */
3616
3617 static VALUE
3618 rb_str_include(VALUE str, VALUE arg)
3619 {
3620     long i;
3621
3622     StringValue(arg);
3623     i = rb_str_index(str, arg, 0);
3624
3625     if (i == -1) return Qfalse;
3626     return Qtrue;
3627 }
3628
3629
3630 /*
3631  *  call-seq:
3632  *     str.to_i(base=10)   => integer
3633  *
3634  *  Returns the result of interpreting leading characters in <i>str</i> as an
3635  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
3636  *  end of a valid number are ignored. If there is not a valid number at the
3637  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
3638  *  exception.
3639  *
3640  *     "12345".to_i             #=> 12345
3641  *     "99 red balloons".to_i   #=> 99
3642  *     "0a".to_i                #=> 0
3643  *     "0a".to_i(16)            #=> 10
3644  *     "hello".to_i             #=> 0
3645  *     "1100101".to_i(2)        #=> 101
3646  *     "1100101".to_i(8)        #=> 294977
3647  *     "1100101".to_i(10)       #=> 1100101
3648  *     "1100101".to_i(16)       #=> 17826049
3649  */
3650
3651 static VALUE
3652 rb_str_to_i(int argc, VALUE *argv, VALUE str)
3653 {
3654     int base;
3655
3656     if (argc == 0) base = 10;
3657     else {
3658         VALUE b;
3659
3660         rb_scan_args(argc, argv, "01", &b);
3661         base = NUM2INT(b);
3662     }
3663     if (base < 0) {
3664         rb_raise(rb_eArgError, "invalid radix %d", base);
3665     }
3666     return rb_str_to_inum(str, base, Qfalse);
3667 }
3668
3669
3670 /*
3671  *  call-seq:
3672  *     str.to_f   => float
3673  *
3674  *  Returns the result of interpreting leading characters in <i>str</i> as a
3675  *  floating point number. Extraneous characters past the end of a valid number
3676  *  are ignored. If there is not a valid number at the start of <i>str</i>,
3677  *  <code>0.0</code> is returned. This method never raises an exception.
3678  *
3679  *     "123.45e1".to_f        #=> 1234.5
3680  *     "45.67 degrees".to_f   #=> 45.67
3681  *     "thx1138".to_f         #=> 0.0
3682  */
3683
3684 static VALUE
3685 rb_str_to_f(VALUE str)
3686 {
3687     return DOUBLE2NUM(rb_str_to_dbl(str, Qfalse));
3688 }
3689
3690
3691 /*
3692  *  call-seq:
3693  *     str.to_s     => str
3694  *     str.to_str   => str
3695  *
3696  *  Returns the receiver.
3697  */
3698
3699 static VALUE
3700 rb_str_to_s(VALUE str)
3701 {
3702     if (rb_obj_class(str) != rb_cString) {
3703         VALUE dup = str_alloc(rb_cString);
3704         rb_str_replace(dup, str);
3705         return dup;
3706     }
3707     return str;
3708 }
3709
3710 static void
3711 str_cat_char(VALUE str, int c, rb_encoding *enc)
3712 {
3713     char s[16];
3714     int n = rb_enc_codelen(c, enc);
3715
3716     rb_enc_mbcput(c, s, enc);
3717     rb_enc_str_buf_cat(str, s, n, enc);
3718 }
3719
3720 static void
3721 prefix_escape(VALUE str, int c, rb_encoding *enc)
3722 {
3723     str_cat_char(str, '\\', enc);
3724     str_cat_char(str, c, enc);
3725 }
3726
3727 /*
3728  * call-seq:
3729  *   str.inspect   => string
3730  *
3731  * Returns a printable version of _str_, surrounded by quote marks,
3732  * with special characters escaped.
3733  *
3734  *    str = "hello"
3735  *    str[3] = "\b"
3736  *    str.inspect       #=> "\"hel\\bo\""
3737  */
3738
VALUE
rb_str_inspect(VALUE str)
{
    rb_encoding *enc = STR_ENC_GET(str);
    char *p, *pend;
    VALUE result = rb_str_buf_new2("");

    /* strings in a non-ASCII-compatible encoding are walked and rendered
     * as US-ASCII instead */
    if (!rb_enc_asciicompat(enc)) enc = rb_usascii_encoding();
    rb_enc_associate(result, enc);
    str_cat_char(result, '"', enc);     /* opening quote */
    p = RSTRING_PTR(str); pend = RSTRING_END(str);
    while (p < pend) {
        int c;          /* current code point */
        int n;          /* byte length of the current character */
        int cc;         /* code point following a '#' */

        n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
            /* no character found here: emit this single byte as \xNN */
            p++;
            n = 1;
            goto escape_codepoint;
        }
        n = MBCLEN_CHARFOUND_LEN(n);

        c = rb_enc_codepoint(p, pend, enc);
        n = rb_enc_codelen(c, enc);

        /* p now points past the current character; p-n is its start */
        p += n;
        /* backslash-escape '"' and '\\', and a '#' that is directly followed
         * by '$', '@' or '{' */
        if (c == '"'|| c == '\\' ||
            (c == '#' &&
             p < pend &&
             MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
             (cc = rb_enc_codepoint(p,pend,enc),
              (cc == '$' || cc == '@' || cc == '{')))) {
            prefix_escape(result, c, enc);
        }
        /* control characters that have a mnemonic escape */
        else if (c == '\n') {
            prefix_escape(result, 'n', enc);
        }
        else if (c == '\r') {
            prefix_escape(result, 'r', enc);
        }
        else if (c == '\t') {
            prefix_escape(result, 't', enc);
        }
        else if (c == '\f') {
            prefix_escape(result, 'f', enc);
        }
        else if (c == '\013') {
            prefix_escape(result, 'v', enc);
        }
        else if (c == '\010') {
            prefix_escape(result, 'b', enc);
        }
        else if (c == '\007') {
            prefix_escape(result, 'a', enc);
        }
        else if (c == 033) {
            prefix_escape(result, 'e', enc);
        }
        /* printable characters are copied through unchanged */
        else if (rb_enc_isprint(c, enc)) {
            rb_enc_str_buf_cat(result, p-n, n, enc);
        }
        else {
            char buf[5];        /* "\xNN" plus the terminating NUL */
            char *s;
            char *q;

          /* also entered by goto from the no-character-found case above */
          escape_codepoint:
            /* emit every byte of the character as a \xNN escape */
            for (q = p-n; q < p; q++) {
                s = buf;
                sprintf(buf, "\\x%02X", *q & 0377);
                while (*s) {
                    str_cat_char(result, *s++, enc);
                }
            }
        }
    }
    str_cat_char(result, '"', enc);     /* closing quote */

    OBJ_INFECT(result, str);
    return result;
}
3822
/* True when p lies before e and points at '$', '@' or '{'.  rb_str_dump uses
 * it on the byte after a '#' to decide whether that '#' needs a backslash. */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
3824
3825 /*
3826  *  call-seq:
3827  *     str.dump   => new_str
3828  *
3829  *  Produces a version of <i>str</i> with all nonprinting characters replaced by
3830  *  <code>\nnn</code> notation and all special characters escaped.
3831  */
3832
3833 VALUE
3834 rb_str_dump(VALUE str)
3835 {
3836     rb_encoding *enc0 = rb_enc_get(str);
3837     long len;
3838     const char *p, *pend;
3839     char *q, *qend;
3840     VALUE result;
3841
3842     len = 2;                    /* "" */
3843     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3844     while (p < pend) {
3845         unsigned char c = *p++;
3846         switch (c) {
3847           case '"':  case '\\':
3848           case '\n': case '\r':
3849           case '\t': case '\f':
3850           case '\013': case '\010': case '\007': case '\033':
3851             len += 2;
3852             break;
3853
3854           case '#':
3855             len += IS_EVSTR(p, pend) ? 2 : 1;
3856             break;
3857
3858           default:
3859             if (ISPRINT(c)) {
3860                 len++;
3861             }
3862             else {
3863                 len += 4;               /* \xNN */
3864             }
3865             break;
3866         }
3867     }
3868     if (!rb_enc_asciicompat(enc0)) {
3869         len += 19;              /* ".force_encoding('')" */
3870         len += strlen(enc0->name);
3871     }
3872
3873     result = rb_str_new5(str, 0, len);
3874     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3875     q = RSTRING_PTR(result); qend = q + len;
3876
3877     *q++ = '"';
3878     while (p < pend) {
3879         unsigned char c = *p++;
3880
3881         if (c == '"' || c == '\\') {
3882             *q++ = '\\';
3883             *q++ = c;
3884         }
3885         else if (c == '#') {
3886             if (IS_EVSTR(p, pend)) *q++ = '\\';
3887             *q++ = '#';
3888         }
3889         else if (c == '\n') {
3890             *q++ = '\\';
3891             *q++ = 'n';
3892         }
3893         else if (c == '\r') {
3894             *q++ = '\\';
3895             *q++ = 'r';
3896         }
3897         else if (c == '\t') {
3898             *q++ = '\\';
3899             *q++ = 't';
3900         }
3901         else if (c == '\f') {
3902             *q++ = '\\';
3903             *q++ = 'f';
3904         }
3905         else if (c == '\013') {
3906             *q++ = '\\';
3907             *q++ = 'v';
3908         }
3909         else if (c == '\010') {
3910             *q++ = '\\';
3911             *q++ = 'b';
3912         }
3913         else if (c == '\007') {
3914             *q++ = '\\';
3915             *q++ = 'a';
3916         }
3917         else if (c == '\033') {
3918             *q++ = '\\';
3919             *q++ = 'e';
3920         }
3921         else if (ISPRINT(c)) {
3922             *q++ = c;
3923         }
3924         else {
3925             *q++ = '\\';
3926             sprintf(q, "x%02X", c);
3927             q += 3;
3928         }
3929     }
3930     *q++ = '"';
3931     if (!rb_enc_asciicompat(enc0)) {
3932         sprintf(q, ".force_encoding(\"%s\")", enc0->name);
3933         enc0 = rb_ascii8bit_encoding();
3934     }
3935
3936     OBJ_INFECT(result, str);
3937     /* result from dump is ASCII */
3938     rb_enc_associate(result, enc0);
3939     return result;
3940 }
3941
3942
3943 /*
3944  *  call-seq:
3945  *     str.upcase!   => str or nil
3946  *
3947  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
3948  *  were made.
3949  *  Note: case replacement is effective only in ASCII region.
3950  */
3951
3952 static VALUE
3953 rb_str_upcase_bang(VALUE str)
3954 {
3955     rb_encoding *enc;
3956     char *s, *send;
3957     int modify = 0;
3958     int cr = ENC_CODERANGE(str);
3959
3960     rb_str_modify(str);
3961     enc = STR_ENC_GET(str);
3962     s = RSTRING_PTR(str); send = RSTRING_END(str);
3963     while (s < send) {
3964         int c = rb_enc_codepoint(s, send, enc);
3965
3966         if (rb_enc_islower(c, enc)) {
3967             /* assuming toupper returns codepoint with same size */
3968             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
3969             modify = 1;
3970         }
3971         s += rb_enc_codelen(c, enc);
3972     }
3973
3974     ENC_CODERANGE_SET(str, cr);
3975     if (modify) return str;
3976     return Qnil;
3977 }
3978
3979
3980 /*
3981  *  call-seq:
3982  *     str.upcase   => new_str
3983  *
3984  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
3985  *  uppercase counterparts. The operation is locale insensitive---only
3986  *  characters ``a'' to ``z'' are affected.
3987  *  Note: case replacement is effective only in ASCII region.
3988  *
3989  *     "hEllO".upcase   #=> "HELLO"
3990  */
3991
3992 static VALUE
3993 rb_str_upcase(VALUE str)
3994 {
3995     str = rb_str_dup(str);
3996     rb_str_upcase_bang(str);
3997     return str;
3998 }
3999
4000
4001 /*
4002  *  call-seq:
4003  *     str.downcase!   => str or nil
4004  *
4005  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4006  *  changes were made.
4007  *  Note: case replacement is effective only in ASCII region.
4008  */
4009
4010 static VALUE
4011 rb_str_downcase_bang(VALUE str)
4012 {
4013     rb_encoding *enc;
4014     char *s, *send;
4015     int modify = 0;
4016     int cr = ENC_CODERANGE(str);
4017
4018     rb_str_modify(str);
4019     enc = STR_ENC_GET(str);
4020     s = RSTRING_PTR(str); send = RSTRING_END(str);
4021     while (s < send) {
4022         int c = rb_enc_codepoint(s, send, enc);
4023
4024         if (rb_enc_isupper(c, enc)) {
4025             /* assuming toupper returns codepoint with same size */
4026             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4027             modify = 1;
4028         }
4029         s += rb_enc_codelen(c, enc);
4030     }
4031
4032     ENC_CODERANGE_SET(str, cr);
4033     if (modify) return str;
4034     return Qnil;
4035 }
4036
4037
4038 /*
4039  *  call-seq:
4040  *     str.downcase   => new_str
4041  *
4042  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
4043  *  lowercase counterparts. The operation is locale insensitive---only
4044  *  characters ``A'' to ``Z'' are affected.
4045  *  Note: case replacement is effective only in ASCII region.
4046  *
4047  *     "hEllO".downcase   #=> "hello"
4048  */
4049
4050 static VALUE
4051 rb_str_downcase(VALUE str)
4052 {
4053     str = rb_str_dup(str);
4054     rb_str_downcase_bang(str);
4055     return str;
4056 }
4057
4058
4059 /*
4060  *  call-seq:
4061  *     str.capitalize!   => str or nil
4062  *
4063  *  Modifies <i>str</i> by converting the first character to uppercase and the
4064  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
4065  *  Note: case conversion is effective only in ASCII region.
4066  *
4067  *     a = "hello"
4068  *     a.capitalize!   #=> "Hello"
4069  *     a               #=> "Hello"
4070  *     a.capitalize!   #=> nil
4071  */
4072
4073 static VALUE
4074 rb_str_capitalize_bang(VALUE str)
4075 {
4076     rb_encoding *enc;
4077     char *s, *send;
4078     int modify = 0;
4079     int c;
4080     int cr = ENC_CODERANGE(str);
4081
4082     rb_str_modify(str);
4083     enc = STR_ENC_GET(str);
4084     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4085     s = RSTRING_PTR(str); send = RSTRING_END(str);
4086
4087     c = rb_enc_codepoint(s, send, enc);
4088     if (rb_enc_islower(c, enc)) {
4089         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4090         modify = 1;
4091     }
4092     s += rb_enc_codelen(c, enc);
4093     while (s < send) {
4094         c = rb_enc_codepoint(s, send, enc);
4095         if (rb_enc_isupper(c, enc)) {
4096             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4097             modify = 1;
4098         }
4099         s += rb_enc_codelen(c, enc);
4100     }
4101
4102     ENC_CODERANGE_SET(str, cr);
4103     if (modify) return str;
4104     return Qnil;
4105 }
4106
4107
4108 /*
4109  *  call-seq:
4110  *     str.capitalize   => new_str
4111  *
4112  *  Returns a copy of <i>str</i> with the first character converted to uppercase
4113  *  and the remainder to lowercase.
4114  *  Note: case conversion is effective only in ASCII region.
4115  *
4116  *     "hello".capitalize    #=> "Hello"
4117  *     "HELLO".capitalize    #=> "Hello"
4118  *     "123ABC".capitalize   #=> "123abc"
4119  */
4120
4121 static VALUE
4122 rb_str_capitalize(VALUE str)
4123 {
4124     str = rb_str_dup(str);
4125     rb_str_capitalize_bang(str);
4126     return str;
4127 }
4128
4129
4130 /*
4131  *  call-seq:
4132 *     str.swapcase!   => str or nil
4133  *
4134  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
4135  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
4136  *  Note: case conversion is effective only in ASCII region.
4137  */
4138
4139 static VALUE
4140 rb_str_swapcase_bang(VALUE str)
4141 {
4142     rb_encoding *enc;
4143     char *s, *send;
4144     int modify = 0;
4145     int cr = ENC_CODERANGE(str);
4146
4147     rb_str_modify(str);
4148     enc = STR_ENC_GET(str);
4149     s = RSTRING_PTR(str); send = RSTRING_END(str);
4150     while (s < send) {
4151         int c = rb_enc_codepoint(s, send, enc);
4152
4153         if (rb_enc_isupper(c, enc)) {
4154             /* assuming toupper returns codepoint with same size */
4155             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4156             modify = 1;
4157         }
4158         else if (rb_enc_islower(c, enc)) {
4159             /* assuming toupper returns codepoint with same size */
4160             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4161             modify = 1;
4162         }
4163         s += rb_enc_codelen(c, enc);
4164     }
4165
4166     ENC_CODERANGE_SET(str, cr);
4167     if (modify) return str;
4168     return Qnil;
4169 }
4170
4171
4172 /*
4173  *  call-seq:
4174  *     str.swapcase   => new_str
4175  *
4176  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
4177  *  to lowercase and lowercase characters converted to uppercase.
4178  *  Note: case conversion is effective only in ASCII region.
4179  *
4180  *     "Hello".swapcase          #=> "hELLO"
4181  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
4182  */
4183
4184 static VALUE
4185 rb_str_swapcase(VALUE str)
4186 {
4187     str = rb_str_dup(str);
4188     rb_str_swapcase_bang(str);
4189     return str;
4190 }
4191
/* Alias for a pointer to unsigned char. */
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification, advanced by trnext().
 */
struct tr {
    /* gen: non-zero while trnext() is expanding a "c1-c2" range;
     * now: the code point most recently produced;
     * max: the last code point of the range being expanded. */
    int gen, now, max;
    /* p: current read position in the specification; pend: its end. */
    char *p, *pend;
};
4198
4199 static int
4200 trnext(struct tr *t, rb_encoding *enc)
4201 {
4202     for (;;) {
4203         if (!t->gen) {
4204             if (t->p == t->pend) return -1;
4205             if (t->p < t->pend - 1 && *t->p == '\\') {
4206                 t->p++;
4207             }
4208             t->now = rb_enc_codepoint(t->p, t->pend, enc);
4209             t->p += rb_enc_codelen(t->now, enc);
4210             if (t->p < t->pend - 1 && *t->p == '-') {
4211                 t->p++;
4212                 if (t->p < t->pend) {
4213                     int c = rb_enc_codepoint(t->p, t->pend, enc);
4214                     t->p += rb_enc_codelen(c, enc);
4215                     if (t->now > c) continue;
4216                     t->gen = 1;
4217                     t->max = c;
4218                 }
4219             }
4220             return t->now;
4221         }
4222         else if (++t->now < t->max) {
4223             return t->now;
4224         }
4225         else {
4226             t->gen = 0;
4227             return t->max;
4228         }
4229     }
4230 }
4231
4232 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
4233
4234 static VALUE
4235 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
4236 {
4237     int trans[256];
4238     rb_encoding *enc, *e1, *e2;
4239     struct tr trsrc, trrepl;
4240     int cflag = 0;
4241     int c, c0, last = 0, modify = 0, i, l;
4242     char *s, *send;
4243     VALUE hash = 0;
4244     int singlebyte = single_byte_optimizable(str);
4245
4246     StringValue(src);
4247     StringValue(repl);
4248     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4249     if (RSTRING_LEN(repl) == 0) {
4250         return rb_str_delete_bang(1, &src, str);
4251     }
4252
4253     e1 = rb_enc_check(str, src);
4254     e2 = rb_enc_check(str, repl);
4255     if (e1 == e2) {
4256         enc = e1;
4257     }
4258     else {
4259         enc = rb_enc_check(src, repl);
4260     }
4261     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
4262     if (RSTRING_LEN(src) > 1 &&
4263         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
4264         trsrc.p + l < trsrc.pend) {
4265         cflag = 1;
4266         trsrc.p += l;
4267     }
4268     trrepl.p = RSTRING_PTR(repl);
4269     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
4270     trsrc.gen = trrepl.gen = 0;
4271     trsrc.now = trrepl.now = 0;
4272     trsrc.max = trrepl.max = 0;
4273
4274     if (cflag) {
4275         for (i=0; i<256; i++) {
4276             trans[i] = 1;
4277         }
4278         while ((c = trnext(&trsrc, enc)) >= 0) {
4279             if (c < 256) {
4280                 trans[c] = -1;
4281             }
4282             else {
4283                 if (!hash) hash = rb_hash_new();
4284                 rb_hash_aset(hash, INT2NUM(c), Qtrue);
4285             }
4286         }
4287         while ((c = trnext(&trrepl, enc)) >= 0)
4288             /* retrieve last replacer */;
4289         last = trrepl.now;
4290         for (i=0; i<256; i++) {
4291             if (trans[i] >= 0) {
4292                 trans[i] = last;
4293             }
4294         }
4295     }
4296     else {
4297         int r;
4298
4299         for (i=0; i<256; i++) {
4300             trans[i] = -1;
4301         }
4302         while ((c = trnext(&trsrc, enc)) >= 0) {
4303             r = trnext(&trrepl, enc);
4304             if (r == -1) r = trrepl.now;
4305             if (c < 256) {
4306                 trans[c] = r;
4307                 if (r > 255) singlebyte = 0;
4308             }
4309             else {
4310                 if (!hash) hash = rb_hash_new();
4311                 rb_hash_aset(hash, INT2NUM(c), INT2NUM(r));
4312             }
4313         }
4314     }
4315
4316     rb_str_modify(str);
4317     s = RSTRING_PTR(str); send = RSTRING_END(str);
4318     if (sflag) {
4319         int clen, tlen, max = RSTRING_LEN(str);
4320         int offset, save = -1;
4321         char *buf = ALLOC_N(char, max), *t = buf;
4322
4323         while (s < send) {
4324             c0 = c = rb_enc_codepoint(s, send, enc);
4325             tlen = clen = rb_enc_codelen(c, enc);
4326
4327             s += clen;
4328             if (c < 256) {
4329                 c = trans[c];
4330             }
4331             else if (hash) {
4332                 VALUE tmp = rb_hash_lookup(hash, INT2NUM(c));
4333                 if (NIL_P(tmp)) {
4334                     if (cflag) c = last;
4335                     else c = -1;
4336                 }
4337                 else if (cflag) c = -1;
4338                 else c = NUM2INT(tmp);
4339             }
4340             else {
4341                 c = -1;
4342             }
4343             if (c >= 0) {
4344                 if (save == c) continue;
4345                 save = c;
4346                 tlen = rb_enc_codelen(c, enc);
4347                 modify = 1;
4348             }
4349             else {
4350                 save = -1;
4351                 c = c0;
4352             }
4353             while (t - buf + tlen >= max) {
4354                 offset = t - buf;
4355                 max *= 2;
4356                 REALLOC_N(buf, char, max);
4357                 t = buf + offset;
4358             }
4359             rb_enc_mbcput(c, t, enc);
4360             t += tlen;
4361         }
4362         *t = '\0';
4363         RSTRING(str)->as.heap.ptr = buf;
4364         RSTRING(str)->as.heap.len = t - buf;
4365         STR_SET_NOEMBED(str);
4366         RSTRING(str)->as.heap.aux.capa = max;
4367     }
4368     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
4369         while (s < send) {
4370             c = (unsigned char)*s;
4371             if (trans[c] >= 0) {
4372                 if (!cflag) {
4373                     c = trans[c];
4374                     *s = c;
4375                     modify = 1;
4376                 }
4377                 else {
4378                     *s = last;
4379                     modify = 1;
4380                 }
4381             }
4382             s++;
4383         }
4384     }
4385     else {
4386         int clen, tlen, max = RSTRING_LEN(str) * 1.2;
4387         int offset;
4388         char *buf = ALLOC_N(char, max), *t = buf;
4389
4390         while (s < send) {
4391             c0 = c = rb_enc_codepoint(s, send, enc);
4392             tlen = clen = rb_enc_codelen(c, enc);
4393
4394             if (c < 256) {
4395                 c = trans[c];
4396             }
4397             else if (hash) {
4398                 VALUE tmp = rb_hash_lookup(hash, INT2NUM(c));
4399                 if (NIL_P(tmp)) {
4400                     if (cflag) c = last;
4401                     else c = -1;
4402                 }
4403                 else if (cflag) c = -1;
4404                 else c = NUM2INT(tmp);
4405             }
4406             else {
4407                 c = -1;
4408             }
4409             if (c >= 0) {
4410                 tlen = rb_enc_codelen(c, enc);
4411                 modify = 1;
4412             }
4413             else {
4414                 modify = 1;
4415                 c = c0;
4416             }
4417             while (t - buf + tlen >= max) {
4418                 offset = t - buf;
4419                 max *= 2;
4420                 REALLOC_N(buf, char, max);
4421                 t = buf + offset;
4422             }
4423             if (s != t) rb_enc_mbcput(c, t, enc);
4424             s += clen;
4425             t += tlen;
4426         }
4427         if (!STR_EMBED_P(str)) {
4428             xfree(RSTRING(str)->as.heap.ptr);
4429         }
4430         *t = '\0';
4431         RSTRING(str)->as.heap.ptr = buf;
4432         RSTRING(str)->as.heap.len = t - buf;
4433         STR_SET_NOEMBED(str);
4434         RSTRING(str)->as.heap.aux.capa = max;
4435     }
4436
4437     if (modify) {
4438         rb_enc_associate(str, enc);
4439         return str;
4440     }
4441     return Qnil;
4442 }
4443
4444
4445 /*
4446  *  call-seq:
4447  *     str.tr!(from_str, to_str)   => str or nil
4448  *
4449  *  Translates <i>str</i> in place, using the same rules as
4450  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
4451  *  changes were made.
4452  */
4453
4454 static VALUE
4455 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
4456 {
4457     return tr_trans(str, src, repl, 0);
4458 }
4459
4460
4461 /*
4462  *  call-seq:
4463  *     str.tr(from_str, to_str)   => new_str
4464  *
4465  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i> replaced
4466  *  by the corresponding characters in <i>to_str</i>. If <i>to_str</i> is
4467  *  shorter than <i>from_str</i>, it is padded with its last character. Both
4468  *  strings may use the c1--c2 notation to denote ranges of characters, and
4469  *  <i>from_str</i> may start with a <code>^</code>, which denotes all
4470  *  characters except those listed.
4471  *
4472  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
4473  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
4474  *     "hello".tr('el', 'ip')      #=> "hippo"
4475  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
4476  */
4477
4478 static VALUE
4479 rb_str_tr(VALUE str, VALUE src, VALUE repl)
4480 {
4481     str = rb_str_dup(str);
4482     tr_trans(str, src, repl, 0);
4483     return str;
4484 }
4485
4486 static void
4487 tr_setup_table(VALUE str, char stable[256], int first,
4488                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
4489 {
4490     char buf[256];
4491     struct tr tr;
4492     int c, l;
4493     VALUE table = 0, ptable = 0;
4494     int i, cflag = 0;
4495
4496     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
4497     tr.gen = tr.now = tr.max = 0;
4498
4499     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
4500         cflag = 1;
4501         tr.p += l;
4502     }
4503     if (first) {
4504         for (i=0; i<256; i++) {
4505             stable[i] = 1;
4506         }
4507     }
4508     for (i=0; i<256; i++) {
4509         buf[i] = cflag;
4510     }
4511
4512     while ((c = trnext(&tr, enc)) >= 0) {
4513         if (c < 256) {
4514             buf[c & 0xff] = !cflag;
4515         }
4516         else {
4517             VALUE key = INT2NUM(c);
4518
4519             if (!table) {
4520                 table = rb_hash_new();
4521                 if (cflag) {
4522                     ptable = *ctablep;
4523                     *ctablep = table;
4524                 }
4525                 else {
4526                     ptable = *tablep;
4527                     *tablep = table;
4528                 }
4529             }
4530             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
4531                 rb_hash_aset(table, key, Qtrue);
4532             }
4533         }
4534     }
4535     for (i=0; i<256; i++) {
4536         stable[i] = stable[i] && buf[i];
4537     }
4538 }
4539
4540
4541 static int
4542 tr_find(int c, char table[256], VALUE del, VALUE nodel)
4543 {
4544     if (c < 256) {
4545         return table[c] ? Qtrue : Qfalse;
4546     }
4547     else {
4548         VALUE v = INT2NUM(c);
4549
4550         if (del && !NIL_P(rb_hash_lookup(del, v))) {
4551             if (!nodel || NIL_P(rb_hash_lookup(nodel, v))) {
4552                 return Qtrue;
4553             }
4554         }
4555         return Qfalse;
4556     }
4557 }
4558
4559 /*
4560  *  call-seq:
4561  *     str.delete!([other_str]+)   => str or nil
4562  *
4563  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
4564  *  <code>nil</code> if <i>str</i> was not modified.
4565  */
4566
4567 static VALUE
4568 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
4569 {
4570     char squeez[256];
4571     rb_encoding *enc = 0;
4572     char *s, *send, *t;
4573     VALUE del = 0, nodel = 0;
4574     int modify = 0;
4575     int i;
4576     int cr;
4577
4578     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4579     cr = ENC_CODERANGE(str);
4580     if (argc < 1) {
4581         rb_raise(rb_eArgError, "wrong number of arguments");
4582     }
4583     for (i=0; i<argc; i++) {
4584         VALUE s = argv[i];
4585
4586         StringValue(s);
4587         enc = rb_enc_check(str, s);
4588         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
4589     }
4590
4591     rb_str_modify(str);
4592     s = t = RSTRING_PTR(str);
4593     if (!s || RSTRING_LEN(str) == 0) return Qnil;
4594     send = RSTRING_END(str);
4595     while (s < send) {
4596         int c = rb_enc_codepoint(s, send, enc);
4597         int clen = rb_enc_codelen(c, enc);
4598
4599         if (tr_find(c, squeez, del, nodel)) {
4600             modify = 1;
4601         }
4602         else {
4603             if (t != s) rb_enc_mbcput(c, t, enc);
4604             t += clen;
4605         }
4606         s += clen;
4607     }
4608     *t = '\0';
4609     STR_SET_LEN(str, t - RSTRING_PTR(str));
4610
4611     ENC_CODERANGE_SET(str, cr);
4612     if (modify) return str;
4613     return Qnil;
4614 }
4615
4616
4617 /*
4618  *  call-seq:
4619  *     str.delete([other_str]+)   => new_str
4620  *
4621  *  Returns a copy of <i>str</i> with all characters in the intersection of its
4622  *  arguments deleted. Uses the same rules for building the set of characters as
4623  *  <code>String#count</code>.
4624  *
4625  *     "hello".delete "l","lo"        #=> "heo"
4626  *     "hello".delete "lo"            #=> "he"
4627  *     "hello".delete "aeiou", "^e"   #=> "hell"
4628  *     "hello".delete "ej-m"          #=> "ho"
4629  */
4630
4631 static VALUE
4632 rb_str_delete(int argc, VALUE *argv, VALUE str)
4633 {
4634     str = rb_str_dup(str);
4635     rb_str_delete_bang(argc, argv, str);
4636     return str;
4637 }
4638
4639
4640 /*
4641  *  call-seq:
4642  *     str.squeeze!([other_str]*)   => str or nil
4643  *
4644  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
4645  *  <code>nil</code> if no changes were made.
4646  */
4647
4648 static VALUE
4649 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
4650 {
4651     char squeez[256];
4652     rb_encoding *enc = 0;
4653     VALUE del = 0, nodel = 0;
4654     char *s, *send, *t;
4655     int save, modify = 0;
4656     int i;
4657
4658     if (argc == 0) {
4659         enc = STR_ENC_GET(str);
4660     }
4661     else {
4662         for (i=0; i<argc; i++) {
4663             VALUE s = argv[i];
4664
4665             StringValue(s);
4666             enc = rb_enc_check(str, s);
4667             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
4668         }
4669     }
4670
4671     rb_str_modify(str);
4672     s = t = RSTRING_PTR(str);
4673     if (!s || RSTRING_LEN(str) == 0) return Qnil;
4674     send = RSTRING_END(str);
4675     save = -1;
4676     while (s < send) {
4677         int c = rb_enc_codepoint(s, send, enc);
4678         int clen = rb_enc_codelen(c, enc);
4679
4680         if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
4681             if (t != s) rb_enc_mbcput(c, t, enc);
4682             save = c;
4683             t += clen;
4684         }
4685         s += clen;
4686     }
4687     *t = '\0';
4688     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
4689         STR_SET_LEN(str, t - RSTRING_PTR(str));
4690         modify = 1;
4691     }
4692
4693     if (modify) return str;
4694     return Qnil;
4695 }
4696
4697
4698 /*
4699  *  call-seq:
4700  *     str.squeeze([other_str]*)    => new_str
4701  *
4702  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
4703  *  procedure described for <code>String#count</code>. Returns a new string
4704  *  where runs of the same character that occur in this set are replaced by a
4705  *  single character. If no arguments are given, all runs of identical
4706  *  characters are replaced by a single character.
4707  *
4708  *     "yellow moon".squeeze                  #=> "yelow mon"
4709  *     "  now   is  the".squeeze(" ")         #=> " now is the"
4710  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
4711  */
4712
4713 static VALUE
4714 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
4715 {
4716     str = rb_str_dup(str);
4717     rb_str_squeeze_bang(argc, argv, str);
4718     return str;
4719 }
4720
4721
4722 /*
4723  *  call-seq:
4724  *     str.tr_s!(from_str, to_str)   => str or nil
4725  *
4726  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
4727  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
4728  */
4729
4730 static VALUE
4731 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
4732 {
4733     return tr_trans(str, src, repl, 1);
4734 }
4735
4736
4737 /*
4738  *  call-seq:
4739  *     str.tr_s(from_str, to_str)   => new_str
4740  *
4741  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
4742  *  then removes duplicate characters in regions that were affected by the
4743  *  translation.
4744  *
4745  *     "hello".tr_s('l', 'r')     #=> "hero"
4746  *     "hello".tr_s('el', '*')    #=> "h*o"
4747  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
4748  */
4749
4750 static VALUE
4751 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
4752 {
4753     str = rb_str_dup(str);
4754     tr_trans(str, src, repl, 1);
4755     return str;
4756 }
4757
4758
4759 /*
4760  *  call-seq:
4761  *     str.count([other_str]+)   => fixnum
4762  *
4763  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
4764  *  intersection of these sets defines the characters to count in
4765  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
4766  *  negated. The sequence c1--c2 means all characters between c1 and c2.
4767  *
4768  *     a = "hello world"
4769  *     a.count "lo"            #=> 5
4770  *     a.count "lo", "o"       #=> 2
4771  *     a.count "hello", "^l"   #=> 4
4772  *     a.count "ej-m"          #=> 4
4773  */
4774
4775 static VALUE
4776 rb_str_count(int argc, VALUE *argv, VALUE str)
4777 {
4778     char table[256];
4779     rb_encoding *enc = 0;
4780     VALUE del = 0, nodel = 0;
4781     char *s, *send;
4782     int i;
4783
4784     if (argc < 1) {
4785         rb_raise(rb_eArgError, "wrong number of arguments");
4786     }
4787     for (i=0; i<argc; i++) {
4788         VALUE s = argv[i];
4789
4790         StringValue(s);
4791         enc = rb_enc_check(str, s);
4792         tr_setup_table(s, table,i==0, &del, &nodel, enc);
4793     }
4794
4795     s = RSTRING_PTR(str);
4796     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
4797     send = RSTRING_END(str);
4798     i = 0;
4799     while (s < send) {
4800         int c = rb_enc_codepoint(s, send, enc);
4801         int clen = rb_enc_codelen(c, enc);
4802
4803         if (tr_find(c, table, del, nodel)) {
4804             i++;
4805         }
4806         s += clen;
4807     }
4808     return INT2NUM(i);
4809 }
4810
4811
4812 /*
4813  *  call-seq:
4814  *     str.split(pattern=$;, [limit])   => anArray
4815  *
4816  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
4817  *  of these substrings.
4818  *
4819  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
4820  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
4821  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
4822  *  of contiguous whitespace characters ignored.
4823  *
4824  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
4825  *  pattern matches. Whenever the pattern matches a zero-length string,
4826  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
4827  *  groups, the respective matches will be returned in the array as well.
4828  *
4829  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
4830  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
4831  *  split on whitespace as if ` ' were specified.
4832  *
4833  *  If the <i>limit</i> parameter is omitted, trailing null fields are
4834  *  suppressed. If <i>limit</i> is a positive number, at most that number of
4835  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
4836  *  string is returned as the only entry in an array). If negative, there is no
4837  *  limit to the number of fields returned, and trailing null fields are not
4838  *  suppressed.
4839  *
4840  *     " now's  the time".split        #=> ["now's", "the", "time"]
4841  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
4842  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
4843  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
4844  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
4845  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
4846  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
4847  *
4848  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
4849  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
4850  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
4851  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
4852  */
4853
4854 static VALUE
4855 rb_str_split_m(int argc, VALUE *argv, VALUE str)
4856 {
4857     rb_encoding *enc;
4858     VALUE spat;
4859     VALUE limit;
4860     int awk_split = Qfalse;
4861     long beg, end, i = 0;
4862     int lim = 0;
4863     VALUE result, tmp;
4864
4865     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
4866         lim = NUM2INT(limit);
4867         if (lim <= 0) limit = Qnil;
4868         else if (lim == 1) {
4869             if (RSTRING_LEN(str) == 0)
4870                 return rb_ary_new2(0);
4871             return rb_ary_new3(1, str);
4872         }
4873         i = 1;
4874     }
4875
4876     enc = STR_ENC_GET(str);
4877     if (NIL_P(spat)) {
4878         if (!NIL_P(rb_fs)) {
4879             spat = rb_fs;
4880             goto fs_set;
4881         }
4882         awk_split = Qtrue;
4883     }
4884     else {
4885       fs_set:
4886         if (TYPE(spat) == T_STRING) {
4887             rb_encoding *enc2 = STR_ENC_GET(spat);
4888
4889             if (rb_enc_mbminlen(enc2) == 1) {
4890                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
4891                     awk_split = Qtrue;
4892                 }
4893             }
4894             else {
4895                 int l;
4896                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
4897                     RSTRING_LEN(spat) == l) {
4898                     awk_split = Qtrue;
4899                 }
4900             }
4901             if (!awk_split) {
4902                 spat = rb_reg_regcomp(rb_reg_quote(spat));
4903             }
4904         }
4905         else {
4906             spat = get_pat(spat, 1);
4907         }
4908     }
4909
4910     result = rb_ary_new();
4911     beg = 0;
4912     if (awk_split) {
4913         char *ptr = RSTRING_PTR(str);
4914         char *eptr = RSTRING_END(str);
4915         char *bptr = ptr;
4916         int skip = 1;
4917         int c;
4918
4919         end = beg;
4920         while (ptr < eptr) {
4921             c = rb_enc_codepoint(ptr, eptr, enc);
4922             ptr += rb_enc_mbclen(ptr, eptr, enc);
4923             if (skip) {
4924                 if (rb_enc_isspace(c, enc)) {
4925                     beg = ptr - bptr;
4926                 }
4927                 else {
4928                     end = ptr - bptr;
4929                     skip = 0;
4930                     if (!NIL_P(limit) && lim <= i) break;
4931                 }
4932             }
4933             else {
4934                 if (rb_enc_isspace(c, enc)) {
4935                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
4936                     skip = 1;
4937                     beg = ptr - bptr;
4938                     if (!NIL_P(limit)) ++i;
4939                 }
4940                 else {
4941                     end = ptr - bptr;
4942                 }
4943             }
4944         }
4945     }
4946     else {
4947         long start = beg;
4948         long idx;
4949         int last_null = 0;
4950         struct re_registers *regs;
4951
4952         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
4953             regs = RMATCH_REGS(rb_backref_get());
4954             if (start == end && BEG(0) == END(0)) {
4955                 if (!RSTRING_PTR(str)) {
4956                     rb_ary_push(result, rb_str_new("", 0));
4957                     break;
4958                 }
4959                 else if (last_null == 1) {
4960                     rb_ary_push(result, rb_str_subseq(str, beg,
4961                                                       rb_enc_mbclen(RSTRING_PTR(str)+beg,
4962                                                                     RSTRING_END(str),
4963                                                                     enc)));
4964                     beg = start;
4965                 }
4966                 else {
4967                     if (RSTRING_PTR(str)+start == RSTRING_END(str))
4968                         start++;
4969                     else
4970                         start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
4971                     last_null = 1;
4972                     continue;
4973                 }
4974             }
4975             else {
4976                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
4977                 beg = start = END(0);
4978             }
4979             last_null = 0;
4980
4981             for (idx=1; idx < regs->num_regs; idx++) {
4982                 if (BEG(idx) == -1) continue;
4983                 if (BEG(idx) == END(idx))
4984                     tmp = rb_str_new5(str, 0, 0);
4985                 else
4986                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
4987                 rb_ary_push(result, tmp);
4988             }
4989             if (!NIL_P(limit) && lim <= ++i) break;
4990         }
4991     }
4992     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
4993         if (RSTRING_LEN(str) == beg)
4994             tmp = rb_str_new5(str, 0, 0);
4995         else
4996             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
4997         rb_ary_push(result, tmp);
4998     }
4999     if (NIL_P(limit) && lim == 0) {
5000         while (RARRAY_LEN(result) > 0 &&
5001                RSTRING_LEN(RARRAY_PTR(result)[RARRAY_LEN(result)-1]) == 0)
5002             rb_ary_pop(result);
5003     }
5004
5005     return result;
5006 }
5007
5008 VALUE
5009 rb_str_split(VALUE str, const char *sep0)
5010 {
5011     VALUE sep;
5012
5013     StringValue(str);
5014     sep = rb_str_new2(sep0);
5015     return rb_str_split_m(1, &sep, str);
5016 }
5017
5018
5019 /*
5020  *  Document-method: lines
5021  *  call-seq:
5022  *     str.lines(separator=$/)   => anEnumerator
5023  *     str.lines(separator=$/) {|substr| block }        => str
5024  *
5025  *  Returns an enumerator that gives each line in the string.  If a block is
5026  *  given, it iterates over each line in the string.
5027  *
5028  *     "foo\nbar\n".lines.to_a   #=> ["foo\n", "bar\n"]
5029  *     "foo\nb ar".lines.sort    #=> ["b ar", "foo\n"]
5030  */
5031
5032 /*
5033  *  Document-method: each_line
5034  *  call-seq:
5035  *     str.each_line(separator=$/) {|substr| block }   => str
5036  *
5037  *  Splits <i>str</i> using the supplied parameter as the record separator
5038  *  (<code>$/</code> by default), passing each substring in turn to the supplied
5039  *  block. If a zero-length record separator is supplied, the string is split
5040  *  into paragraphs delimited by multiple successive newlines.
5041  *
5042  *     print "Example one\n"
5043  *     "hello\nworld".each {|s| p s}
5044  *     print "Example two\n"
5045  *     "hello\nworld".each('l') {|s| p s}
5046  *     print "Example three\n"
5047  *     "hello\n\n\nworld".each('') {|s| p s}
5048  *
5049  *  <em>produces:</em>
5050  *
5051  *     Example one
5052  *     "hello\n"
5053  *     "world"
5054  *     Example two
5055  *     "hel"
5056  *     "l"
5057  *     "o\nworl"
5058  *     "d"
5059  *     Example three
5060  *     "hello\n\n\n"
5061  *     "world"
5062  */
5063
5064 static VALUE
5065 rb_str_each_line(int argc, VALUE *argv, VALUE str)
5066 {
5067     rb_encoding *enc;
5068     VALUE rs;
5069     int newline;
5070     char *p, *pend, *s, *ptr;
5071     long len, rslen;
5072     VALUE line;
5073     int n;
5074     VALUE orig = str;
5075
5076     if (argc == 0) {
5077         rs = rb_rs;
5078     }
5079     else {
5080         rb_scan_args(argc, argv, "01", &rs);
5081     }
5082     RETURN_ENUMERATOR(str, argc, argv);
5083     if (NIL_P(rs)) {
5084         rb_yield(str);
5085         return orig;
5086     }
5087     str = rb_str_new4(str);
5088     ptr = p = s = RSTRING_PTR(str);
5089     pend = p + RSTRING_LEN(str);
5090     len = RSTRING_LEN(str);
5091     StringValue(rs);
5092     if (rs == rb_default_rs) {
5093         enc = rb_enc_get(str);
5094         while (p < pend) {
5095             char *p0;
5096
5097             p = memchr(p, '\n', pend - p);
5098             if (!p) break;
5099             p0 = rb_enc_left_char_head(s, p, enc);
5100             if (!rb_enc_is_newline(p0, pend, enc)) {
5101                 p++;
5102                 continue;
5103             }
5104             p = p0 + rb_enc_mbclen(p0, pend, enc);
5105             line = rb_str_new5(str, s, p - s);
5106             OBJ_INFECT(line, str);
5107             rb_enc_cr_str_copy_for_substr(line, str);
5108             rb_yield(line);
5109             str_mod_check(str, ptr, len);
5110             s = p;
5111         }
5112         goto finish;
5113     }
5114
5115     enc = rb_enc_check(str, rs);
5116     rslen = RSTRING_LEN(rs);
5117     if (rslen == 0) {
5118         newline = '\n';
5119     }
5120     else {
5121         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
5122     }
5123
5124     while (p < pend) {
5125         int c = rb_enc_codepoint(p, pend, enc);
5126
5127       again:
5128         n = rb_enc_codelen(c, enc);
5129         if (rslen == 0 && c == newline) {
5130             p += n;
5131             if (p < pend && (c = rb_enc_codepoint(p, pend, enc)) != newline) {
5132                 goto again;
5133             }
5134             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
5135                 p += n;
5136             }
5137             p -= n;
5138         }
5139         if (c == newline &&
5140             (rslen <= 1 || memcmp(RSTRING_PTR(rs), p, rslen) == 0)) {
5141             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
5142             OBJ_INFECT(line, str);
5143             rb_enc_cr_str_copy_for_substr(line, str);
5144             rb_yield(line);
5145             str_mod_check(str, ptr, len);
5146             s = p + (rslen ? rslen : n);
5147         }
5148         p += n;
5149     }
5150
5151   finish:
5152     if (s != pend) {
5153         line = rb_str_new5(str, s, pend - s);
5154         OBJ_INFECT(line, str);
5155         rb_enc_cr_str_copy_for_substr(line, str);
5156         rb_yield(line);
5157     }
5158
5159     return orig;
5160 }
5161
5162
5163 /*
5164  *  Document-method: bytes
5165  *  call-seq:
5166  *     str.bytes   => anEnumerator
5167  *     str.bytes {|fixnum| block }    => str
5168  *
5169  *  Returns an enumerator that gives each byte in the string.  If a block is
5170  *  given, it iterates over each byte in the string.
5171  *
5172  *     "hello".bytes.to_a        #=> [104, 101, 108, 108, 111]
5173  */
5174
5175 /*
5176  *  Document-method: each_byte
5177  *  call-seq:
5178  *     str.each_byte {|fixnum| block }    => str
5179  *
5180  *  Passes each byte in <i>str</i> to the given block.
5181  *
5182  *     "hello".each_byte {|c| print c, ' ' }
5183  *
5184  *  <em>produces:</em>
5185  *
5186  *     104 101 108 108 111
5187  */
5188
5189 static VALUE
5190 rb_str_each_byte(VALUE str)
5191 {
5192     long i;
5193
5194     RETURN_ENUMERATOR(str, 0, 0);
5195     for (i=0; i<RSTRING_LEN(str); i++) {
5196         rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
5197     }
5198     return str;
5199 }
5200
5201
5202 /*
5203  *  Document-method: chars
5204  *  call-seq:
5205  *     str.chars                   => anEnumerator
5206  *     str.chars {|substr| block } => str
5207  *
5208  *  Returns an enumerator that gives each character in the string.
5209  *  If a block is given, it iterates over each character in the string.
5210  *
5211  *     "foo".chars.to_a   #=> ["f","o","o"]
5212  */
5213
5214 /*
5215  *  Document-method: each_char
5216  *  call-seq:
5217  *     str.each_char {|cstr| block }    => str
5218  *
5219  *  Passes each character in <i>str</i> to the given block.
5220  *
5221  *     "hello".each_char {|c| print c, ' ' }
5222  *
5223  *  <em>produces:</em>
5224  *
5225  *     h e l l o
5226  */
5227
5228 static VALUE
5229 rb_str_each_char(VALUE str)
5230 {
5231     int i, len, n;
5232     const char *ptr;
5233     rb_encoding *enc;
5234
5235     RETURN_ENUMERATOR(str, 0, 0);
5236     str = rb_str_new4(str);
5237     ptr = RSTRING_PTR(str);
5238     len = RSTRING_LEN(str);
5239     enc = rb_enc_get(str);
5240     for (i = 0; i < len; i += n) {
5241         n = rb_enc_mbclen(ptr + i, ptr + len, enc);
5242         rb_yield(rb_str_subseq(str, i, n));
5243     }
5244     return str;
5245 }
5246
5247 static long
5248 chopped_length(VALUE str)
5249 {
5250     rb_encoding *enc = STR_ENC_GET(str);
5251     const char *p, *p2, *beg, *end;
5252
5253     beg = RSTRING_PTR(str);
5254     end = beg + RSTRING_LEN(str);
5255     if (beg > end) return 0;
5256     p = rb_enc_prev_char(beg, end, enc);
5257     if (!p) return 0;
5258     if (p > beg && rb_enc_codepoint(p, end, enc) == '\n') {
5259         p2 = rb_enc_prev_char(beg, p, enc);
5260         if (p2 && rb_enc_codepoint(p2, end, enc) == '\r') p = p2;
5261     }
5262     return p - beg;
5263 }
5264
5265 /*
5266  *  call-seq:
5267  *     str.chop!   => str or nil
5268  *
5269  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
5270  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
5271  *  <code>String#chomp!</code>.
5272  */
5273
5274 static VALUE
5275 rb_str_chop_bang(VALUE str)
5276 {
5277     if (RSTRING_LEN(str) > 0) {
5278         long len;
5279         rb_str_modify(str);
5280         len = chopped_length(str);
5281         STR_SET_LEN(str, len);
5282         RSTRING_PTR(str)[len] = '\0';
5283         return str;
5284     }
5285     return Qnil;
5286 }
5287
5288
5289 /*
5290  *  call-seq:
5291  *     str.chop   => new_str
5292  *
5293  *  Returns a new <code>String</code> with the last character removed.  If the
5294  *  string ends with <code>\r\n</code>, both characters are removed. Applying
5295  *  <code>chop</code> to an empty string returns an empty
5296  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
5297  *  the string unchanged if it doesn't end in a record separator.
5298  *
5299  *     "string\r\n".chop   #=> "string"
5300  *     "string\n\r".chop   #=> "string\n"
5301  *     "string\n".chop     #=> "string"
5302  *     "string".chop       #=> "strin"
5303  *     "x".chop.chop       #=> ""
5304  */
5305
5306 static VALUE
5307 rb_str_chop(VALUE str)
5308 {
5309     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
5310     rb_enc_cr_str_copy_for_substr(str2, str);
5311     OBJ_INFECT(str2, str);
5312     return str2;
5313 }
5314
5315
5316 /*
5317  *  call-seq:
5318  *     str.chomp!(separator=$/)   => str or nil
5319  *
5320  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
5321  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
5322  */
5323
5324 static VALUE
5325 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
5326 {
5327     rb_encoding *enc;
5328     VALUE rs;
5329     int newline;
5330     char *p, *pp, *e;
5331     long len, rslen;
5332
5333     len = RSTRING_LEN(str);
5334     if (len == 0) return Qnil;
5335     p = RSTRING_PTR(str);
5336     e = p + len;
5337     if (argc == 0) {
5338         rs = rb_rs;
5339         if (rs == rb_default_rs) {
5340           smart_chomp:
5341             rb_str_modify(str);
5342             enc = rb_enc_get(str);
5343             if (rb_enc_mbminlen(enc) > 1) {
5344                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), enc);
5345                 if (rb_enc_is_newline(pp, e, enc)) {
5346                     e = pp;
5347                 }
5348                 pp = e - rb_enc_mbminlen(enc);
5349                 if (pp >= p) {
5350                     pp = rb_enc_left_char_head(p, pp, enc);
5351                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
5352                         e = pp;
5353                     }
5354                 }
5355                 if (e == RSTRING_END(str)) {
5356                     return Qnil;
5357                 }
5358                 len = e - RSTRING_PTR(str);
5359                 STR_SET_LEN(str, len);
5360             }
5361             else {
5362                 if (RSTRING_PTR(str)[len-1] == '\n') {
5363                     STR_DEC_LEN(str);
5364                     if (RSTRING_LEN(str) > 0 &&
5365                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
5366                         STR_DEC_LEN(str);
5367                     }
5368                 }
5369                 else if (RSTRING_PTR(str)[len-1] == '\r') {
5370                     STR_DEC_LEN(str);
5371                 }
5372                 else {
5373                     return Qnil;
5374                 }
5375             }
5376             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5377             return str;
5378         }
5379     }
5380     else {
5381         rb_scan_args(argc, argv, "01", &rs);
5382     }
5383     if (NIL_P(rs)) return Qnil;
5384     StringValue(rs);
5385     rslen = RSTRING_LEN(rs);
5386     if (rslen == 0) {
5387         while (len>0 && p[len-1] == '\n') {
5388             len--;
5389             if (len>0 && p[len-1] == '\r')
5390                 len--;
5391         }
5392         if (len < RSTRING_LEN(str)) {
5393             rb_str_modify(str);
5394             STR_SET_LEN(str, len);
5395             RSTRING_PTR(str)[len] = '\0';
5396             return str;
5397         }
5398         return Qnil;
5399     }
5400     if (rslen > len) return Qnil;
5401     newline = RSTRING_PTR(rs)[rslen-1];
5402     if (rslen == 1 && newline == '\n')
5403         goto smart_chomp;
5404
5405     enc = rb_enc_check(str, rs);
5406     if (is_broken_string(rs)) {
5407         return Qnil;
5408     }
5409     pp = e - rslen;
5410     if (p[len-1] == newline &&
5411         (rslen <= 1 ||
5412          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
5413         if (rb_enc_left_char_head(p, pp, enc) != pp)
5414             return Qnil;
5415         rb_str_modify(str);
5416         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
5417         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5418         return str;
5419     }
5420     return Qnil;
5421 }
5422
5423
5424 /*
5425  *  call-seq:
5426  *     str.chomp(separator=$/)   => new_str
5427  *
5428  *  Returns a new <code>String</code> with the given record separator removed
5429  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
5430  *  changed from the default Ruby record separator, then <code>chomp</code> also
5431  *  removes carriage return characters (that is it will remove <code>\n</code>,
5432  *  <code>\r</code>, and <code>\r\n</code>).
5433  *
5434  *     "hello".chomp            #=> "hello"
5435  *     "hello\n".chomp          #=> "hello"
5436  *     "hello\r\n".chomp        #=> "hello"
5437  *     "hello\n\r".chomp        #=> "hello\n"
5438  *     "hello\r".chomp          #=> "hello"
5439  *     "hello \n there".chomp   #=> "hello \n there"
5440  *     "hello".chomp("llo")     #=> "he"
5441  */
5442
5443 static VALUE
5444 rb_str_chomp(int argc, VALUE *argv, VALUE str)
5445 {
5446     str = rb_str_dup(str);
5447     rb_str_chomp_bang(argc, argv, str);
5448     return str;
5449 }
5450
5451 /*
5452  *  call-seq:
5453  *     str.lstrip!   => self or nil
5454  *
5455  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
5456  *  change was made. See also <code>String#rstrip!</code> and
5457  *  <code>String#strip!</code>.
5458  *
5459  *     "  hello  ".lstrip   #=> "hello  "
5460  *     "hello".lstrip!      #=> nil
5461  */
5462
5463 static VALUE
5464 rb_str_lstrip_bang(VALUE str)
5465 {
5466     rb_encoding *enc;
5467     char *s, *t, *e;
5468
5469     rb_str_modify(str);
5470     enc = STR_ENC_GET(str);
5471     s = RSTRING_PTR(str);
5472     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5473     e = t = RSTRING_END(str);
5474     /* remove spaces at head */
5475     while (s < e) {
5476         int cc = rb_enc_codepoint(s, e, enc);
5477
5478         if (!rb_enc_isspace(cc, enc)) break;
5479         s += rb_enc_codelen(cc, enc);
5480     }
5481
5482     if (s > RSTRING_PTR(str)) {
5483         rb_str_modify(str);
5484         STR_SET_LEN(str, t-s);
5485         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
5486         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5487         return str;
5488     }
5489     return Qnil;
5490 }
5491
5492
5493 /*
5494  *  call-seq:
5495  *     str.lstrip   => new_str
5496  *
5497  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
5498  *  <code>String#rstrip</code> and <code>String#strip</code>.
5499  *
5500  *     "  hello  ".lstrip   #=> "hello  "
5501  *     "hello".lstrip       #=> "hello"
5502  */
5503
5504 static VALUE
5505 rb_str_lstrip(VALUE str)
5506 {
5507     str = rb_str_dup(str);
5508     rb_str_lstrip_bang(str);
5509     return str;
5510 }
5511
5512
5513 /*
5514  *  call-seq:
5515  *     str.rstrip!   => self or nil
5516  *
5517  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
5518  *  no change was made. See also <code>String#lstrip!</code> and
5519  *  <code>String#strip!</code>.
5520  *
5521  *     "  hello  ".rstrip   #=> "  hello"
5522  *     "hello".rstrip!      #=> nil
5523  */
5524
5525 static VALUE
5526 rb_str_rstrip_bang(VALUE str)
5527 {
5528     rb_encoding *enc;
5529     char *s, *t, *e;
5530     int space_seen = Qfalse;
5531
5532     rb_str_modify(str);
5533     enc = STR_ENC_GET(str);
5534     s = RSTRING_PTR(str);
5535     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5536     t = e = RSTRING_END(str);
5537     while (s < e) {
5538         int cc = rb_enc_codepoint(s, e, enc);
5539
5540         if (!cc || rb_enc_isspace(cc, enc)) {
5541             if (!space_seen) t = s;
5542             space_seen = Qtrue;
5543         }
5544         else {
5545             space_seen = Qfalse;
5546         }
5547         s += rb_enc_codelen(cc, enc);
5548     }
5549     if (!space_seen) t = s;
5550     if (t < e) {
5551         rb_str_modify(str);
5552         STR_SET_LEN(str, t-RSTRING_PTR(str));
5553         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5554         return str;
5555     }
5556     return Qnil;
5557 }
5558
5559
5560 /*
5561  *  call-seq:
5562  *     str.rstrip   => new_str
5563  *
5564  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
5565  *  <code>String#lstrip</code> and <code>String#strip</code>.
5566  *
5567  *     "  hello  ".rstrip   #=> "  hello"
5568  *     "hello".rstrip       #=> "hello"
5569  */
5570
5571 static VALUE
5572 rb_str_rstrip(VALUE str)
5573 {
5574     str = rb_str_dup(str);
5575     rb_str_rstrip_bang(str);
5576     return str;
5577 }
5578
5579
5580 /*
5581  *  call-seq:
5582  *     str.strip!   => str or nil
5583  *
5584  *  Removes leading and trailing whitespace from <i>str</i>. Returns
5585  *  <code>nil</code> if <i>str</i> was not altered.
5586  */
5587
5588 static VALUE
5589 rb_str_strip_bang(VALUE str)
5590 {
5591     VALUE l = rb_str_lstrip_bang(str);
5592     VALUE r = rb_str_rstrip_bang(str);
5593
5594     if (NIL_P(l) && NIL_P(r)) return Qnil;
5595     return str;
5596 }
5597
5598
5599 /*
5600  *  call-seq:
5601  *     str.strip   => new_str
5602  *
5603  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
5604  *
5605  *     "    hello    ".strip   #=> "hello"
5606  *     "\tgoodbye\r\n".strip   #=> "goodbye"
5607  */
5608
5609 static VALUE
5610 rb_str_strip(VALUE str)
5611 {
5612     str = rb_str_dup(str);
5613     rb_str_strip_bang(str);
5614     return str;
5615 }
5616
5617 static VALUE
5618 scan_once(VALUE str, VALUE pat, long *start)
5619 {
5620     rb_encoding *enc;
5621     VALUE result, match;
5622     struct re_registers *regs;
5623     long i;
5624
5625     enc = STR_ENC_GET(str);
5626     if (rb_reg_search(pat, str, *start, 0) >= 0) {
5627         match = rb_backref_get();
5628         regs = RMATCH_REGS(match);
5629         if (BEG(0) == END(0)) {
5630             /*
5631              * Always consume at least one character of the input string
5632              */
5633             if (RSTRING_LEN(str) > END(0))
5634                 *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
5635                                               RSTRING_END(str), enc);
5636             else
5637                 *start = END(0)+1;
5638         }
5639         else {
5640             *start = END(0);
5641         }
5642         if (regs->num_regs == 1) {
5643             return rb_reg_nth_match(0, match);
5644         }
5645         result = rb_ary_new2(regs->num_regs);
5646         for (i=1; i < regs->num_regs; i++) {
5647             rb_ary_push(result, rb_reg_nth_match(i, match));
5648         }
5649
5650         return result;
5651     }
5652     return Qnil;
5653 }
5654
5655
5656 /*
5657  *  call-seq:
5658  *     str.scan(pattern)                         => array
5659  *     str.scan(pattern) {|match, ...| block }   => str
5660  *
5661  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
5662  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
5663  *  generated and either added to the result array or passed to the block. If
5664  *  the pattern contains no groups, each individual result consists of the
5665  *  matched string, <code>$&</code>.  If the pattern contains groups, each
5666  *  individual result is itself an array containing one entry per group.
5667  *
5668  *     a = "cruel world"
5669  *     a.scan(/\w+/)        #=> ["cruel", "world"]
5670  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
5671  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
5672  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
5673  *
5674  *  And the block form:
5675  *
5676  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
5677  *     print "\n"
5678  *     a.scan(/(.)(.)/) {|x,y| print y, x }
5679  *     print "\n"
5680  *
5681  *  <em>produces:</em>
5682  *
5683  *     <<cruel>> <<world>>
5684  *     rceu lowlr
5685  */
5686
5687 static VALUE
5688 rb_str_scan(VALUE str, VALUE pat)
5689 {
5690     VALUE result;
5691     long start = 0;
5692     VALUE match = Qnil;
5693     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
5694
5695     pat = get_pat(pat, 1);
5696     if (!rb_block_given_p()) {
5697         VALUE ary = rb_ary_new();
5698
5699         while (!NIL_P(result = scan_once(str, pat, &start))) {
5700             match = rb_backref_get();
5701             rb_ary_push(ary, result);
5702         }
5703         rb_backref_set(match);
5704         return ary;
5705     }
5706
5707     while (!NIL_P(result = scan_once(str, pat, &start))) {
5708         rb_yield(result);
5709         str_mod_check(str, p, len);
5710     }
5711     rb_backref_set(match);
5712     return str;
5713 }
5714
5715
5716 /*
5717  *  call-seq:
5718  *     str.hex   => integer
5719  *
5720  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
5721  *  (with an optional sign and an optional <code>0x</code>) and returns the
5722  *  corresponding number. Zero is returned on error.
5723  *
5724  *     "0x0a".hex     #=> 10
5725  *     "-1234".hex    #=> -4660
5726  *     "0".hex        #=> 0
5727  *     "wombat".hex   #=> 0
5728  */
5729
5730 static VALUE
5731 rb_str_hex(VALUE str)
5732 {
5733     rb_encoding *enc = rb_enc_get(str);
5734
5735     if (!rb_enc_asciicompat(enc)) {
5736         rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5737     }
5738     return rb_str_to_inum(str, 16, Qfalse);
5739 }
5740
5741
5742 /*
5743  *  call-seq:
5744  *     str.oct   => integer
5745  *
5746  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
5747  *  optional sign) and returns the corresponding number.  Returns 0 if the
5748  *  conversion fails.
5749  *
5750  *     "123".oct       #=> 83
5751  *     "-377".oct      #=> -255
5752  *     "bad".oct       #=> 0
5753  *     "0377bad".oct   #=> 255
5754  */
5755
5756 static VALUE
5757 rb_str_oct(VALUE str)
5758 {
5759     rb_encoding *enc = rb_enc_get(str);
5760
5761     if (!rb_enc_asciicompat(enc)) {
5762         rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5763     }
5764     return rb_str_to_inum(str, -8, Qfalse);
5765 }
5766
5767
5768 /*
5769  *  call-seq:
5770  *     str.crypt(other_str)   => new_str
5771  *
5772  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
5773  *  library function <code>crypt</code>. The argument is the salt string, which
5774  *  should be two characters long, each character drawn from
5775  *  <code>[a-zA-Z0-9./]</code>.
5776  */
5777
5778 static VALUE
5779 rb_str_crypt(VALUE str, VALUE salt)
5780 {
5781     extern char *crypt(const char *, const char *);
5782     VALUE result;
5783     const char *s;
5784
5785     StringValue(salt);
5786     if (RSTRING_LEN(salt) < 2)
5787         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
5788
5789     if (RSTRING_PTR(str)) s = RSTRING_PTR(str);
5790     else s = "";
5791     result = rb_str_new2(crypt(s, RSTRING_PTR(salt)));
5792     OBJ_INFECT(result, str);
5793     OBJ_INFECT(result, salt);
5794     return result;
5795 }
5796
5797
5798 /*
5799  *  call-seq:
5800  *     str.intern   => symbol
5801  *     str.to_sym   => symbol
5802  *
5803  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
5804  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
5805  *
5806  *     "Koala".intern         #=> :Koala
5807  *     s = 'cat'.to_sym       #=> :cat
5808  *     s == :cat              #=> true
5809  *     s = '@cat'.to_sym      #=> :@cat
5810  *     s == :@cat             #=> true
5811  *
5812  *  This can also be used to create symbols that cannot be represented using the
5813  *  <code>:xxx</code> notation.
5814  *
5815  *     'cat and dog'.to_sym   #=> :"cat and dog"
5816  */
5817
5818 VALUE
5819 rb_str_intern(VALUE s)
5820 {
5821     VALUE str = RB_GC_GUARD(s);
5822     ID id;
5823
5824     id = rb_intern_str(str);
5825     return ID2SYM(id);
5826 }
5827
5828
5829 /*
5830  *  call-seq:
5831  *     str.ord   => integer
5832  *
5833  *  Return the <code>Integer</code> ordinal of a one-character string.
5834  *
5835  *     "a".ord         #=> 97
5836  */
5837
5838 VALUE
5839 rb_str_ord(VALUE s)
5840 {
5841     int c;
5842
5843     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
5844     return INT2NUM(c);
5845 }
5846 /*
5847  *  call-seq:
5848  *     str.sum(n=16)   => integer
5849  *
5850  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
5851  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
 *  to 16. The result is simply the sum of the binary value of each character in
 *  <i>str</i> modulo <code>2**n</code> (the sum is masked with
 *  <code>2**n - 1</code>). This is not a particularly good checksum.
5855  */
5856
5857 static VALUE
5858 rb_str_sum(int argc, VALUE *argv, VALUE str)
5859 {
5860     VALUE vbits;
5861     int bits;
5862     char *ptr, *p, *pend;
5863     long len;
5864
5865     if (argc == 0) {
5866         bits = 16;
5867     }
5868     else {
5869         rb_scan_args(argc, argv, "01", &vbits);
5870         bits = NUM2INT(vbits);
5871     }
5872     ptr = p = RSTRING_PTR(str);
5873     len = RSTRING_LEN(str);
5874     pend = p + len;
5875     if (bits >= sizeof(long)*CHAR_BIT) {
5876         VALUE sum = INT2FIX(0);
5877
5878         while (p < pend) {
5879             str_mod_check(str, ptr, len);
5880             sum = rb_funcall(sum, '+', 1, INT2FIX((unsigned char)*p));
5881             p++;
5882         }
5883         if (bits != 0) {
5884             VALUE mod;
5885
5886             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
5887             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
5888             sum = rb_funcall(sum, '&', 1, mod);
5889         }
5890         return sum;
5891     }
5892     else {
5893        unsigned long sum = 0;
5894
5895         while (p < pend) {
5896             str_mod_check(str, ptr, len);
5897             sum += (unsigned char)*p;
5898             p++;
5899         }
5900         if (bits != 0) {
5901            sum &= (((unsigned long)1)<<bits)-1;
5902         }
5903         return rb_int2inum(sum);
5904     }
5905 }
5906
5907 static VALUE
5908 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
5909 {
5910     rb_encoding *enc;
5911     VALUE w;
5912     long width, len, flen = 1, fclen = 1;
5913     VALUE res;
5914     char *p;
5915     const char *f = " ";
5916     long n, llen, rlen;
5917     volatile VALUE pad;
5918     int singlebyte = 1;
5919
5920     rb_scan_args(argc, argv, "11", &w, &pad);
5921     enc = STR_ENC_GET(str);
5922     width = NUM2LONG(w);
5923     if (argc == 2) {
5924         StringValue(pad);
5925         enc = rb_enc_check(str, pad);
5926         f = RSTRING_PTR(pad);
5927         flen = RSTRING_LEN(pad);
5928         fclen = str_strlen(pad, enc);
5929         singlebyte = single_byte_optimizable(pad);
5930         if (flen == 0 || fclen == 0) {
5931             rb_raise(rb_eArgError, "zero width padding");
5932         }
5933     }
5934     len = str_strlen(str, enc);
5935     if (width < 0 || len >= width) return rb_str_dup(str);
5936     n = width - len;
5937     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
5938     rlen = n - llen;
5939     res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2);
5940     p = RSTRING_PTR(res);
5941     while (llen) {
5942         if (flen <= 1) {
5943             *p++ = *f;
5944             llen--;
5945         }
5946         else if (llen > fclen) {
5947             memcpy(p,f,flen);
5948             p += flen;
5949             llen -= fclen;
5950         }
5951         else {
5952             char *fp = str_nth(f, f+flen, llen, enc, singlebyte);
5953             n = fp - f;
5954             memcpy(p,f,n);
5955             p+=n;
5956             break;
5957         }
5958     }
5959     memcpy(p, RSTRING_PTR(str), RSTRING_LEN(str));
5960     p+=RSTRING_LEN(str);
5961     while (rlen) {
5962         if (flen <= 1) {
5963             *p++ = *f;
5964             rlen--;
5965         }
5966         else if (rlen > fclen) {
5967             memcpy(p,f,flen);
5968             p += flen;
5969             rlen -= fclen;
5970         }
5971         else {
5972             char *fp = str_nth(f, f+flen, rlen, enc, singlebyte);
5973             n = fp - f;
5974             memcpy(p,f,n);
5975             p+=n;
5976             break;
5977         }
5978     }
5979     *p = '\0';
5980     STR_SET_LEN(res, p-RSTRING_PTR(res));
5981     OBJ_INFECT(res, str);
5982     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
5983     rb_enc_associate(res, enc);
5984     return res;
5985 }
5986
5987
5988 /*
5989  *  call-seq:
5990  *     str.ljust(integer, padstr=' ')   => new_str
5991  *
5992  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
5993  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
5994  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
5995  *
5996  *     "hello".ljust(4)            #=> "hello"
5997  *     "hello".ljust(20)           #=> "hello               "
5998  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
5999  */
6000
6001 static VALUE
6002 rb_str_ljust(int argc, VALUE *argv, VALUE str)
6003 {
6004     return rb_str_justify(argc, argv, str, 'l');
6005 }
6006
6007
6008 /*
6009  *  call-seq:
6010  *     str.rjust(integer, padstr=' ')   => new_str
6011  *
6012  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6013  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
6014  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6015  *
6016  *     "hello".rjust(4)            #=> "hello"
6017  *     "hello".rjust(20)           #=> "               hello"
6018  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
6019  */
6020
6021 static VALUE
6022 rb_str_rjust(int argc, VALUE *argv, VALUE str)
6023 {
6024     return rb_str_justify(argc, argv, str, 'r');
6025 }
6026
6027
6028 /*
6029  *  call-seq:
 *     str.center(integer, padstr=' ')   => new_str
6031  *
6032  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6033  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
6034  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6035  *
6036  *     "hello".center(4)         #=> "hello"
6037  *     "hello".center(20)        #=> "       hello        "
6038  *     "hello".center(20, '123') #=> "1231231hello12312312"
6039  */
6040
6041 static VALUE
6042 rb_str_center(int argc, VALUE *argv, VALUE str)
6043 {
6044     return rb_str_justify(argc, argv, str, 'c');
6045 }
6046
6047 /*
6048  *  call-seq:
6049  *     str.partition(sep)              => [head, sep, tail]
6050  *
6051  *  Searches the string for <i>sep</i> and returns the part before
6052  *  it, the <i>sep</i>, and the part after it.  If <i>sep</i> is not found,
6053  *  returns <i>str</i> and two empty strings.
6054  *
6055  *     "hello".partition("l")         #=> ["he", "l", "lo"]
6056  *     "hello".partition("x")         #=> ["hello", "", ""]
6057  */
6058
6059 static VALUE
6060 rb_str_partition(VALUE str, VALUE sep)
6061 {
6062     long pos;
6063     int regex = Qfalse;
6064
6065     if (TYPE(sep) == T_REGEXP) {
6066         pos = rb_reg_search(sep, str, 0, 0);
6067         regex = Qtrue;
6068     }
6069     else {
6070         VALUE tmp;
6071
6072         tmp = rb_check_string_type(sep);
6073         if (NIL_P(tmp)) {
6074             rb_raise(rb_eTypeError, "type mismatch: %s given",
6075                      rb_obj_classname(sep));
6076         }
6077         pos = rb_str_index(str, sep, 0);
6078     }
6079     if (pos < 0) {
6080       failed:
6081         return rb_ary_new3(3, str, rb_str_new(0,0),rb_str_new(0,0));
6082     }
6083     if (regex) {
6084         sep = rb_str_subpat(str, sep, 0);
6085         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
6086     }
6087     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
6088                           sep,
6089                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
6090                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
6091 }
6092
6093 /*
6094  *  call-seq:
6095  *     str.rpartition(sep)            => [head, sep, tail]
6096  *
6097  *  Searches <i>sep</i> in the string from the end of the string, and
6098  *  returns the part before it, the <i>sep</i>, and the part after it.
6099  *  If <i>sep</i> is not found, returns two empty strings and
6100  *  <i>str</i>.
6101  *
6102  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
6103  *     "hello".rpartition("x")         #=> ["", "", "hello"]
6104  */
6105
6106 static VALUE
6107 rb_str_rpartition(VALUE str, VALUE sep)
6108 {
6109     long pos = RSTRING_LEN(str);
6110     int regex = Qfalse;
6111
6112     if (TYPE(sep) == T_REGEXP) {
6113         pos = rb_reg_search(sep, str, pos, 1);
6114         regex = Qtrue;
6115     }
6116     else {
6117         VALUE tmp;
6118
6119         tmp = rb_check_string_type(sep);
6120         if (NIL_P(tmp)) {
6121             rb_raise(rb_eTypeError, "type mismatch: %s given",
6122                      rb_obj_classname(sep));
6123         }
6124         pos = rb_str_sublen(str, pos);
6125         pos = rb_str_rindex(str, sep, pos);
6126     }
6127     if (pos < 0) {
6128         return rb_ary_new3(3, rb_str_new(0,0),rb_str_new(0,0), str);
6129     }
6130     if (regex) {
6131         sep = rb_reg_nth_match(0, rb_backref_get());
6132     }
6133     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
6134                           sep,
6135                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
6136 }
6137
6138 /*
6139  *  call-seq:
6140  *     str.start_with?([prefix]+)   => true or false
6141  *
6142  *  Returns true if <i>str</i> starts with the prefix given.
6143  */
6144
6145 static VALUE
6146 rb_str_start_with(int argc, VALUE *argv, VALUE str)
6147 {
6148     int i;
6149
6150     for (i=0; i<argc; i++) {
6151         VALUE tmp = rb_check_string_type(argv[i]);
6152         if (NIL_P(tmp)) continue;
6153         rb_enc_check(str, tmp);
6154         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6155         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6156             return Qtrue;
6157     }
6158     return Qfalse;
6159 }
6160
6161 /*
6162  *  call-seq:
6163  *     str.end_with?([suffix]+)   => true or false
6164  *
6165  *  Returns true if <i>str</i> ends with the suffix given.
6166  */
6167
6168 static VALUE
6169 rb_str_end_with(int argc, VALUE *argv, VALUE str)
6170 {
6171     int i;
6172     char *p, *s;
6173     rb_encoding *enc;
6174
6175     for (i=0; i<argc; i++) {
6176         VALUE tmp = rb_check_string_type(argv[i]);
6177         if (NIL_P(tmp)) continue;
6178         enc = rb_enc_check(str, tmp);
6179         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6180         p = RSTRING_PTR(str);
6181         s = p + RSTRING_LEN(str) - RSTRING_LEN(tmp);
6182         if (rb_enc_left_char_head(p, s, enc) != s)
6183             continue;
6184         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6185             return Qtrue;
6186     }
6187     return Qfalse;
6188 }
6189
6190 void
6191 rb_str_setter(VALUE val, ID id, VALUE *var)
6192 {
6193     if (!NIL_P(val) && TYPE(val) != T_STRING) {
6194         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
6195     }
6196     *var = val;
6197 }
6198
6199
6200 /*
6201  *  call-seq:
6202  *     str.force_encoding(encoding)   => str
6203  *
6204  *  Changes the encoding to +encoding+ and returns self.
6205  */
6206
6207 static VALUE
6208 rb_str_force_encoding(VALUE str, VALUE enc)
6209 {
6210     str_modifiable(str);
6211     rb_enc_associate(str, rb_to_encoding(enc));
6212     return str;
6213 }
6214
6215 /*
6216  *  call-seq:
6217  *     str.valid_encoding?  => true or false
6218  *
6219  *  Returns true for a string which encoded correctly.
6220  *
6221  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding? => true
6222  *    "\xc2".force_encoding("UTF-8").valid_encoding? => false
6223  *    "\x80".force_encoding("UTF-8").valid_encoding? => false
6224  */
6225
6226 static VALUE
6227 rb_str_valid_encoding_p(VALUE str)
6228 {
6229     int cr = rb_enc_str_coderange(str);
6230
6231     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
6232 }
6233
6234 /*
6235  *  call-seq:
6236  *     str.ascii_only?  => true or false
6237  *
6238  *  Returns true for a string which has only ASCII characters.
6239  *
6240  *    "abc".force_encoding("UTF-8").ascii_only? => true
6241  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only? => false
6242  */
6243
6244 static VALUE
6245 rb_str_is_ascii_only_p(VALUE str)
6246 {
6247     int cr = rb_enc_str_coderange(str);
6248
6249     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
6250 }
6251
6252 /**********************************************************************
6253  * Document-class: Symbol
6254  *
6255  *  <code>Symbol</code> objects represent names and some strings
6256  *  inside the Ruby
6257  *  interpreter. They are generated using the <code>:name</code> and
6258  *  <code>:"string"</code> literals
6259  *  syntax, and by the various <code>to_sym</code> methods. The same
6260  *  <code>Symbol</code> object will be created for a given name or string
6261  *  for the duration of a program's execution, regardless of the context
6262  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
6263  *  one context, a method in another, and a class in a third, the
6264  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
6265  *  all three contexts.
6266  *
6267  *     module One
6268  *       class Fred
6269  *       end
6270  *       $f1 = :Fred
6271  *     end
6272  *     module Two
6273  *       Fred = 1
6274  *       $f2 = :Fred
6275  *     end
6276  *     def Fred()
6277  *     end
6278  *     $f3 = :Fred
6279  *     $f1.object_id   #=> 2514190
6280  *     $f2.object_id   #=> 2514190
6281  *     $f3.object_id   #=> 2514190
6282  *
6283  */
6284
6285
6286 /*
6287  *  call-seq:
6288  *     sym == obj   => true or false
6289  *
6290  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
6291  *  symbol, returns <code>true</code>. Otherwise, compares them
6292  *  as strings.
6293  */
6294
6295 static VALUE
6296 sym_equal(VALUE sym1, VALUE sym2)
6297 {
6298     if (sym1 == sym2) return Qtrue;
6299     return Qfalse;
6300 }
6301
6302
6303 /*
6304  *  call-seq:
6305  *     sym.inspect    => string
6306  *
6307  *  Returns the representation of <i>sym</i> as a symbol literal.
6308  *
6309  *     :fred.inspect   #=> ":fred"
6310  */
6311
6312 static VALUE
6313 sym_inspect(VALUE sym)
6314 {
6315     VALUE str, klass = Qundef;
6316     ID id = SYM2ID(sym);
6317     rb_encoding *enc;
6318
6319     sym = rb_id2str(id);
6320     enc = STR_ENC_GET(sym);
6321     str = rb_enc_str_new(0, RSTRING_LEN(sym)+1, enc);
6322     RSTRING_PTR(str)[0] = ':';
6323     memcpy(RSTRING_PTR(str)+1, RSTRING_PTR(sym), RSTRING_LEN(sym));
6324     if (RSTRING_LEN(sym) != strlen(RSTRING_PTR(sym)) ||
6325         !rb_enc_symname_p(RSTRING_PTR(sym), enc)) {
6326         str = rb_str_inspect(str);
6327         strncpy(RSTRING_PTR(str), ":\"", 2);
6328     }
6329     if (klass != Qundef) {
6330         rb_str_cat2(str, "/");
6331         rb_str_append(str, rb_inspect(klass));
6332     }
6333     return str;
6334 }
6335
6336
6337 /*
6338  *  call-seq:
6339  *     sym.id2name   => string
6340  *     sym.to_s      => string
6341  *
6342  *  Returns the name or string corresponding to <i>sym</i>.
6343  *
6344  *     :fred.id2name   #=> "fred"
6345  */
6346
6347
6348 VALUE
6349 rb_sym_to_s(VALUE sym)
6350 {
6351     ID id = SYM2ID(sym);
6352
6353     return str_new3(rb_cString, rb_id2str(id));
6354 }
6355
6356
6357 /*
6358  * call-seq:
6359  *   sym.to_sym   => sym
6360  *   sym.intern   => sym
6361  *
6362  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
6363  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
6364  * in this case.
6365  */
6366
6367 static VALUE
6368 sym_to_sym(VALUE sym)
6369 {
6370     return sym;
6371 }
6372
6373 static VALUE
6374 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
6375 {
6376     VALUE obj;
6377
6378     if (argc < 1) {
6379         rb_raise(rb_eArgError, "no receiver given");
6380     }
6381     obj = argv[0];
6382     return rb_funcall3(obj, (ID)sym, argc - 1, argv + 1);
6383 }
6384
6385 /*
6386  * call-seq:
6387  *   sym.to_proc
6388  *
6389  * Returns a _Proc_ object which respond to the given method by _sym_.
6390  *
6391  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
6392  */
6393
6394 static VALUE
6395 sym_to_proc(VALUE sym)
6396 {
6397     return rb_proc_new(sym_call, (VALUE)SYM2ID(sym));
6398 }
6399
6400
6401 static VALUE
6402 sym_succ(VALUE sym)
6403 {
6404     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
6405 }
6406
6407 static VALUE
6408 sym_cmp(VALUE sym, VALUE other)
6409 {
6410     if (!SYMBOL_P(other)) {
6411         return Qnil;
6412     }
6413     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
6414 }
6415
6416 static VALUE
6417 sym_casecmp(VALUE sym, VALUE other)
6418 {
6419     if (!SYMBOL_P(other)) {
6420         return Qnil;
6421     }
6422     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
6423 }
6424
6425 static VALUE
6426 sym_match(VALUE sym, VALUE other)
6427 {
6428     return rb_str_match(rb_sym_to_s(sym), other);
6429 }
6430
6431 static VALUE
6432 sym_eqq(VALUE sym, VALUE other)
6433 {
6434     if (sym == other) return Qtrue;
6435     return rb_str_equal(rb_sym_to_s(sym), other);
6436 }
6437
6438 static VALUE
6439 sym_aref(int argc, VALUE *argv, VALUE sym)
6440 {
6441     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
6442 }
6443
6444 static VALUE
6445 sym_length(VALUE sym)
6446 {
6447     return rb_str_length(rb_id2str(SYM2ID(sym)));
6448 }
6449
6450 static VALUE
6451 sym_empty(VALUE sym)
6452 {
6453     return rb_str_empty(rb_id2str(SYM2ID(sym)));
6454 }
6455
6456 static VALUE
6457 sym_upcase(VALUE sym)
6458 {
6459     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
6460 }
6461
6462 static VALUE
6463 sym_downcase(VALUE sym)
6464 {
6465     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
6466 }
6467
6468 static VALUE
6469 sym_capitalize(VALUE sym)
6470 {
6471     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
6472 }
6473
6474 static VALUE
6475 sym_swapcase(VALUE sym)
6476 {
6477     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
6478 }
6479
6480 static VALUE
6481 sym_encoding(VALUE sym)
6482 {
6483     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
6484 }
6485
6486 ID
6487 rb_to_id(VALUE name)
6488 {
6489     VALUE tmp;
6490     ID id;
6491
6492     switch (TYPE(name)) {
6493       default:
6494         tmp = rb_check_string_type(name);
6495         if (NIL_P(tmp)) {
6496             rb_raise(rb_eTypeError, "%s is not a symbol",
6497                      RSTRING_PTR(rb_inspect(name)));
6498         }
6499         name = tmp;
6500         /* fall through */
6501       case T_STRING:
6502         name = rb_str_intern(name);
6503         /* fall through */
6504       case T_SYMBOL:
6505         return SYM2ID(name);
6506     }
6507     return id;
6508 }
6509
6510 /*
6511  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
6512  *  bytes, typically representing characters. String objects may be created
6513  *  using <code>String::new</code> or as literals.
6514  *
6515  *  Because of aliasing issues, users of strings should be aware of the methods
6516  *  that modify the contents of a <code>String</code> object.  Typically,
6517  *  methods with names ending in ``!'' modify their receiver, while those
6518  *  without a ``!'' return a new <code>String</code>.  However, there are
6519  *  exceptions, such as <code>String#[]=</code>.
6520  *
6521  */
6522
6523 void
6524 Init_String(void)
6525 {
6526 #undef rb_intern
6527
6528     rb_cString  = rb_define_class("String", rb_cObject);
6529     rb_include_module(rb_cString, rb_mComparable);
6530     rb_define_alloc_func(rb_cString, str_alloc);
6531     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
6532     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
6533     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
6534     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
6535     rb_define_method(rb_cString, "==", rb_str_equal, 1);
6536     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
6537     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
6538     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
6539     rb_define_method(rb_cString, "+", rb_str_plus, 1);
6540     rb_define_method(rb_cString, "*", rb_str_times, 1);
6541     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
6542     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
6543     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
6544     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
6545     rb_define_method(rb_cString, "length", rb_str_length, 0);
6546     rb_define_method(rb_cString, "size", rb_str_length, 0);
6547     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
6548     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
6549     rb_define_method(rb_cString, "=~", rb_str_match, 1);
6550     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
6551     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
6552     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
6553     rb_define_method(rb_cString, "next", rb_str_succ, 0);
6554     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
6555     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
6556     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
6557     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
6558     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
6559     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
6560     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
6561     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
6562     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
6563
6564     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
6565     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
6566     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
6567     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
6568     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
6569     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
6570
6571     rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
6572     rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
6573     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
6574     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
6575
6576     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
6577     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
6578     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
6579     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
6580
6581     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
6582     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
6583     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
6584     rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
6585     rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
6586     rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
6587     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
6588     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
6589     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
6590     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
6591     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
6592     rb_define_method(rb_cString, "intern", rb_str_intern, 0);
6593     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
6594     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
6595
6596     rb_define_method(rb_cString, "include?", rb_str_include, 1);
6597     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
6598     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
6599
6600     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
6601
6602     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
6603     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
6604     rb_define_method(rb_cString, "center", rb_str_center, -1);
6605
6606     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
6607     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
6608     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
6609     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
6610     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
6611     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
6612     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
6613
6614     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
6615     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
6616     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
6617     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
6618     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
6619     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
6620     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
6621
6622     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
6623     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
6624     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
6625     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
6626     rb_define_method(rb_cString, "count", rb_str_count, -1);
6627
6628     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
6629     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
6630     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
6631     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
6632
6633     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
6634     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
6635     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
6636
6637     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
6638
6639     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
6640     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
6641
6642     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
6643     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
6644
6645     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
6646     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
6647     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
6648     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
6649
6650     id_to_s = rb_intern("to_s");
6651
6652     rb_fs = Qnil;
6653     rb_define_variable("$;", &rb_fs);
6654     rb_define_variable("$-F", &rb_fs);
6655
6656     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
6657     rb_include_module(rb_cSymbol, rb_mComparable);
6658     rb_undef_alloc_func(rb_cSymbol);
6659     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
6660     rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
6661
6662     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
6663     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
6664     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
6665     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
6666     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
6667     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
6668     rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
6669     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
6670     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
6671
6672     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
6673     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
6674     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
6675     rb_define_method(rb_cSymbol, "===", sym_eqq, 1);
6676
6677     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
6678     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
6679     rb_define_method(rb_cSymbol, "length", sym_length, 0);
6680     rb_define_method(rb_cSymbol, "size", sym_length, 0);
6681     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
6682     rb_define_method(rb_cSymbol, "match", sym_match, 1);
6683
6684     rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
6685     rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
6686     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
6687     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
6688
6689     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
6690 }