string.c

   1 /**********************************************************************
   2
   3   string.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 17:12:58 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
  10   Copyright (C) 2000  Information-technology Promotion Agency, Japan
  11
  12 **********************************************************************/
  13
  14 #include "ruby/ruby.h"
  15 #include "ruby/re.h"
  16 #include "ruby/encoding.h"
  17
/* Shorthand for the begin/end offset number `no` stored in a variable named
 * `regs`, which must be in scope wherever these macros are expanded
 * (presumably regexp match registers -- confirm at the use sites). */
#define BEG(no) regs->beg[no]
#define END(no) regs->end[no]
  20
  21 #include <math.h>
  22 #include <ctype.h>
  23
  24 #ifdef HAVE_UNISTD_H
  25 #include <unistd.h>
  26 #endif
  27
  28 #undef rb_str_new2
  29 #undef rb_tainted_str_new2
  30 #undef rb_usascii_str_new2
  31 #undef rb_str_buf_new2
  32 #undef rb_str_buf_cat2
  33 #undef rb_str_cat2
  34
/* Class objects exported by this file.  rb_cString is the class given to
 * strings created by rb_str_new() and rb_str_buf_new(). */
VALUE rb_cString;
VALUE rb_cSymbol;
  37
/*
 * Per-string flag bits.
 *
 *   STR_TMPLOCK  str_modifiable() refuses to modify a string carrying it.
 *   STR_NOEMBED  the bytes live in as.heap.ptr instead of the embedded area.
 *   STR_SHARED   together with STR_NOEMBED: as.heap.aux.shared names the
 *                string whose buffer is being shared.
 *   STR_ASSOC    together with STR_NOEMBED: as.heap.aux.shared holds an
 *                object attached by rb_str_associate().
 *
 * In the SHARED and ASSOC states the aux union does not hold a capacity,
 * hence the "NOCAPA" helpers.
 */
#define STR_TMPLOCK FL_USER7
#define STR_NOEMBED FL_USER1
#define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
#define STR_ASSOC   FL_USER3
#define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
#define STR_ASSOC_P(s)  FL_ALL(s, STR_NOEMBED|STR_ASSOC)
#define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
#define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
/* Clears the SHARED/ASSOC bits, but only for a non-embedded string. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
} while (0)
  49
  50
/* Marks str as heap-allocated and zeroes the embedded-length bits kept in
 * the flags word.  Callers must fill in as.heap themselves. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET(str, STR_NOEMBED);\
    STR_SET_EMBED_LEN(str, 0);\
} while (0)
/* Marks str as embedded; the embedded length bits are left untouched. */
#define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
/* True when str keeps its bytes in the embedded area. */
#define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
/* Stores n in the embedded-length bit field of the flags word.
 * n is evaluated exactly once. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
  62
/* Sets the byte length of str to n, writing it to the flags word for an
 * embedded string and to as.heap.len otherwise. */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN(str, n);\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)
  71
/* Decrements the byte length of str by one, in whichever place the length
 * is stored (flags word when embedded, as.heap.len otherwise).  No check is
 * made that the length is greater than zero. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN(str, n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)
  82
  83 #define RESIZE_CAPA(str,capacity) do {\
  84     if (STR_EMBED_P(str)) {\
  85         if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
  86             char *tmp = ALLOC_N(char, capacity+1);\
  87             memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
  88             RSTRING(str)->as.heap.ptr = tmp;\
  89             RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
  90             STR_SET_NOEMBED(str);\
  91             RSTRING(str)->as.heap.aux.capa = (capacity);\
  92         }\
  93     }\
  94     else {\
  95         REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
  96         if (!STR_NOCAPA_P(str))\
  97             RSTRING(str)->as.heap.aux.capa = (capacity);\
  98     }\
  99 } while (0)
 100
/* Coderange predicates; both may trigger a scan via rb_enc_str_coderange(). */
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* Looks up the rb_encoding for the encoding index stored on str. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
 105
 106 static int
 107 single_byte_optimizable(VALUE str)
 108 {
 109     rb_encoding *enc = STR_ENC_GET(str);
 110
 111     if (rb_enc_mbmaxlen(enc) == 1)
 112         return 1;
 113
 114     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
 115     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
 116         return 1;
 117
 118     /* Conservative.  Possibly single byte.
 119      * "\xa1" in Shift_JIS for example. */
 120     return 0;
 121 }
 122
/* NOTE(review): exported global; its use is not visible in this part of the
 * file (presumably the default field separator -- confirm). */
VALUE rb_fs;
 124
/*
 * Returns a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when no such byte exists.
 *
 * When SIZEOF_VALUE is 4 or 8 and the range is longer than two VALUE words,
 * the aligned middle part of the range is tested one VALUE at a time
 * against NONASCII_MASK (the high bit of every byte).
 *
 * NONASCII_MASK stays defined after this function; later code in this file
 * checks for it with #ifdef.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080LL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if (sizeof(VALUE) * 2 < e - p) {
        const VALUE *s, *t;
        const VALUE lowbits = sizeof(VALUE) - 1;
        /* s: p rounded up to the next VALUE boundary */
        s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
        /* unaligned head, byte by byte */
        while (p < (const char *)s) {
            if (!ISASCII(*p))
                return p;
            p++;
        }
        /* t: e rounded down to a VALUE boundary */
        t = (const VALUE*)(~lowbits & (VALUE)e);
        while (s < t) {
            if (*s & NONASCII_MASK) {
                /* a high bit is set somewhere in this word; stop here and
                 * let the byte loop below locate the exact byte */
                t = s;
                break;
            }
            s++;
        }
        p = (const char *)t;
    }
#endif
    /* remaining bytes (the whole range when the word scan was skipped) */
    while (p < e) {
        if (!ISASCII(*p))
            return p;
        p++;
    }
    return NULL;
}
 161
 162 static int
 163 coderange_scan(const char *p, long len, rb_encoding *enc)
 164 {
 165     const char *e = p + len;
 166
 167     if (rb_enc_to_index(enc) == 0) {
 168         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 169         p = search_nonascii(p, e);
 170         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 171     }
 172
 173     if (rb_enc_asciicompat(enc)) {
 174         p = search_nonascii(p, e);
 175         if (!p) {
 176             return ENC_CODERANGE_7BIT;
 177         }
 178         while (p < e) {
 179             int ret = rb_enc_precise_mbclen(p, e, enc);
 180             if (!MBCLEN_CHARFOUND_P(ret)) {
 181                 return ENC_CODERANGE_BROKEN;
 182             }
 183             p += MBCLEN_CHARFOUND_LEN(ret);
 184             if (p < e) {
 185                 p = search_nonascii(p, e);
 186                 if (!p) {
 187                     return ENC_CODERANGE_VALID;
 188                 }
 189             }
 190         }
 191         if (e < p) {
 192             return ENC_CODERANGE_BROKEN;
 193         }
 194         return ENC_CODERANGE_VALID;
 195     }
 196
 197     while (p < e) {
 198         int ret = rb_enc_precise_mbclen(p, e, enc);
 199
 200         if (!MBCLEN_CHARFOUND_P(ret)) {
 201             return ENC_CODERANGE_BROKEN;
 202         }
 203         p += MBCLEN_CHARFOUND_LEN(ret);
 204     }
 205     if (e < p) {
 206         return ENC_CODERANGE_BROKEN;
 207     }
 208     return ENC_CODERANGE_VALID;
 209 }
 210
/*
 * Coderange scan over [s, e) that can be resumed by the caller.
 *
 * *cr is both input and output: it carries the coderange accumulated so
 * far and receives the updated one.  The return value is the number of
 * bytes, counted from s, that were consumed.
 *
 *  - *cr == ENC_CODERANGE_BROKEN on entry: nothing is scanned and the whole
 *    length is reported as consumed.
 *  - A previous ENC_CODERANGE_VALID is never downgraded to 7BIT.
 *  - When rb_enc_precise_mbclen() does not find a character, *cr becomes
 *    BROKEN if the bytes are invalid and UNKNOWN otherwise, and the offset
 *    of that position is returned.
 */
long
rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
{
    const char *p = s;

    if (*cr == ENC_CODERANGE_BROKEN)
        return e - s;

    if (rb_enc_to_index(enc) == 0) {
        /* enc is ASCII-8BIT.  An ASCII-8BIT string can never be broken. */
        p = search_nonascii(p, e);
        *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
        return e - s;
    }
    else if (rb_enc_asciicompat(enc)) {
        /* skip the leading ASCII run */
        p = search_nonascii(p, e);
        if (!p) {
            if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
            return e - s;
        }
        while (p < e) {
            int ret = rb_enc_precise_mbclen(p, e, enc);
            if (!MBCLEN_CHARFOUND_P(ret)) {
                *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
                return p - s;
            }
            p += MBCLEN_CHARFOUND_LEN(ret);
            if (p < e) {
                /* jump over the next ASCII run, if any */
                p = search_nonascii(p, e);
                if (!p) {
                    *cr = ENC_CODERANGE_VALID;
                    return e - s;
                }
            }
        }
        *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
        return p - s;
    }
    else {
        /* not ASCII-compatible: every character has to be validated */
        while (p < e) {
            int ret = rb_enc_precise_mbclen(p, e, enc);
            if (!MBCLEN_CHARFOUND_P(ret)) {
                *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
                return p - s;
            }
            p += MBCLEN_CHARFOUND_LEN(ret);
        }
        *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
        return p - s;
    }
}
 262
 263 static inline void
 264 str_enc_copy(VALUE str1, VALUE str2)
 265 {
 266     rb_enc_set_index(str1, ENCODING_GET(str2));
 267 }
 268
 269 static void
 270 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
 271 {
 272     /* this function is designed for copying encoding and coderange
 273      * from src to new string "dest" which is made from the part of src.
 274      */
 275     str_enc_copy(dest, src);
 276     switch (ENC_CODERANGE(src)) {
 277       case ENC_CODERANGE_7BIT:
 278         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 279         break;
 280       case ENC_CODERANGE_VALID:
 281         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
 282             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
 283             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 284         else
 285             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 286         break;
 287       default:
 288         if (RSTRING_LEN(dest) == 0) {
 289             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
 290                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 291             else
 292                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 293         }
 294         break;
 295     }
 296 }
 297
 298 static void
 299 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
 300 {
 301     str_enc_copy(dest, src);
 302     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
 303 }
 304
 305 int
 306 rb_enc_str_coderange(VALUE str)
 307 {
 308     int cr = ENC_CODERANGE(str);
 309
 310     if (cr == ENC_CODERANGE_UNKNOWN) {
 311         rb_encoding *enc = STR_ENC_GET(str);
 312         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
 313         ENC_CODERANGE_SET(str, cr);
 314     }
 315     return cr;
 316 }
 317
 318 int
 319 rb_enc_str_asciionly_p(VALUE str)
 320 {
 321     rb_encoding *enc = STR_ENC_GET(str);
 322
 323     if (!rb_enc_asciicompat(enc))
 324         return Qfalse;
 325     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
 326         return Qtrue;
 327     return Qfalse;
 328 }
 329
 330 static inline void
 331 str_mod_check(VALUE s, const char *p, long len)
 332 {
 333     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
 334         rb_raise(rb_eRuntimeError, "string modified");
 335     }
 336 }
 337
 338 static inline void
 339 str_frozen_check(VALUE s)
 340 {
 341     if (OBJ_FROZEN(s)) {
 342         rb_raise(rb_eRuntimeError, "string frozen");
 343     }
 344 }
 345
 346 size_t
 347 rb_str_capacity(VALUE str)
 348 {
 349     if (STR_EMBED_P(str)) {
 350         return RSTRING_EMBED_LEN_MAX;
 351     }
 352     else if (STR_NOCAPA_P(str)) {
 353         return RSTRING(str)->as.heap.len;
 354     }
 355     else {
 356         return RSTRING(str)->as.heap.aux.capa;
 357     }
 358 }
 359
 360 static inline VALUE
 361 str_alloc(VALUE klass)
 362 {
 363     NEWOBJ(str, struct RString);
 364     OBJSETUP(str, klass, T_STRING);
 365
 366     str->as.heap.ptr = 0;
 367     str->as.heap.len = 0;
 368     str->as.heap.aux.capa = 0;
 369
 370     return (VALUE)str;
 371 }
 372
 373 static VALUE
 374 str_new(VALUE klass, const char *ptr, long len)
 375 {
 376     VALUE str;
 377
 378     if (len < 0) {
 379         rb_raise(rb_eArgError, "negative string size (or size too big)");
 380     }
 381
 382     str = str_alloc(klass);
 383     if (len > RSTRING_EMBED_LEN_MAX) {
 384         RSTRING(str)->as.heap.aux.capa = len;
 385         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
 386         STR_SET_NOEMBED(str);
 387     }
 388     if (ptr) {
 389         memcpy(RSTRING_PTR(str), ptr, len);
 390     }
 391     STR_SET_LEN(str, len);
 392     RSTRING_PTR(str)[len] = '\0';
 393     return str;
 394 }
 395
 396 VALUE
 397 rb_str_new(const char *ptr, long len)
 398 {
 399     return str_new(rb_cString, ptr, len);
 400 }
 401
 402 VALUE
 403 rb_usascii_str_new(const char *ptr, long len)
 404 {
 405     VALUE str = rb_str_new(ptr, len);
 406     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 407     return str;
 408 }
 409
 410 VALUE
 411 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
 412 {
 413     VALUE str = rb_str_new(ptr, len);
 414     rb_enc_associate(str, enc);
 415     return str;
 416 }
 417
 418 VALUE
 419 rb_str_new2(const char *ptr)
 420 {
 421     if (!ptr) {
 422         rb_raise(rb_eArgError, "NULL pointer given");
 423     }
 424     return rb_str_new(ptr, strlen(ptr));
 425 }
 426
 427 VALUE
 428 rb_usascii_str_new2(const char *ptr)
 429 {
 430     VALUE str = rb_str_new2(ptr);
 431     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 432     return str;
 433 }
 434
 435 VALUE
 436 rb_tainted_str_new(const char *ptr, long len)
 437 {
 438     VALUE str = rb_str_new(ptr, len);
 439
 440     OBJ_TAINT(str);
 441     return str;
 442 }
 443
 444 VALUE
 445 rb_tainted_str_new2(const char *ptr)
 446 {
 447     VALUE str = rb_str_new2(ptr);
 448
 449     OBJ_TAINT(str);
 450     return str;
 451 }
 452
 453 static VALUE
 454 str_replace_shared(VALUE str2, VALUE str)
 455 {
 456     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
 457         STR_SET_EMBED(str2);
 458         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
 459         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
 460     }
 461     else {
 462         FL_SET(str2, STR_NOEMBED);
 463         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
 464         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
 465         RSTRING(str2)->as.heap.aux.shared = str;
 466         FL_SET(str2, ELTS_SHARED);
 467     }
 468     rb_enc_cr_str_exact_copy(str2, str);
 469
 470     return str2;
 471 }
 472
 473 static VALUE
 474 str_new_shared(VALUE klass, VALUE str)
 475 {
 476     return str_replace_shared(str_alloc(klass), str);
 477 }
 478
 479 static VALUE
 480 str_new3(VALUE klass, VALUE str)
 481 {
 482     return str_new_shared(klass, str);
 483 }
 484
 485 VALUE
 486 rb_str_new3(VALUE str)
 487 {
 488     VALUE str2 = str_new3(rb_obj_class(str), str);
 489
 490     OBJ_INFECT(str2, str);
 491     return str2;
 492 }
 493
/*
 * Creates a heap-mode string of class klass that points at the same buffer
 * as str, and returns it.
 *
 *  - If str already shares a buffer, the new string is linked to the same
 *    aux.shared object.
 *  - Otherwise str itself is flagged ELTS_SHARED and its aux field is
 *    overwritten to point at the new string.
 *
 * Encoding, coderange and taint are copied from str.
 */
static VALUE
str_new4(VALUE klass, VALUE str)
{
    VALUE str2;

    str2 = str_alloc(klass);
    STR_SET_NOEMBED(str2);
    RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
    RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
    if (STR_SHARED_P(str)) {
        FL_SET(str2, ELTS_SHARED);
        RSTRING(str2)->as.heap.aux.shared = RSTRING(str)->as.heap.aux.shared;
    }
    else {
        /* str becomes the sharing side; str2 now carries the buffer */
        FL_SET(str, ELTS_SHARED);
        RSTRING(str)->as.heap.aux.shared = str2;
    }
    rb_enc_cr_str_exact_copy(str2, str);
    OBJ_INFECT(str2, str);
    return str2;
}
 515
/*
 * Returns a frozen string with the contents of orig.
 *
 *  - orig already frozen: orig itself is returned.
 *  - orig shares a buffer: the shared string is reused directly when it has
 *    the same length and class and is not less tainted than orig; otherwise
 *    a new shared string is created and advanced by the length difference.
 *  - orig embedded: a plain copy is made.
 *  - orig carries an associated object (STR_ASSOC): the association is
 *    moved over to the new string.
 *  - otherwise: str_new4() is used.
 */
VALUE
rb_str_new4(VALUE orig)
{
    VALUE klass, str;

    if (OBJ_FROZEN(orig)) return orig;
    klass = rb_obj_class(orig);
    if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
        long ofs;
        /* length difference between the shared string and orig */
        ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
        if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
            (!OBJ_TAINTED(str) && OBJ_TAINTED(orig))) {
            str = str_new3(klass, str);
            /* NOTE(review): these writes assume str_new3() returned a
             * heap-mode string -- confirm for short shared strings. */
            RSTRING(str)->as.heap.ptr += ofs;
            RSTRING(str)->as.heap.len -= ofs;
        }
        rb_enc_cr_str_exact_copy(str, orig);
        OBJ_INFECT(str, orig);
    }
    else if (STR_EMBED_P(orig)) {
        str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
        rb_enc_cr_str_exact_copy(str, orig);
        OBJ_INFECT(str, orig);
    }
    else if (STR_ASSOC_P(orig)) {
        /* save the associated object; str_new4() overwrites orig's aux */
        VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
        FL_UNSET(orig, STR_ASSOC);
        str = str_new4(klass, orig);
        FL_SET(str, STR_ASSOC);
        RSTRING(str)->as.heap.aux.shared = assoc;
    }
    else {
        str = str_new4(klass, orig);
    }
    OBJ_FREEZE(str);
    return str;
}
 553
 554 VALUE
 555 rb_str_new5(VALUE obj, const char *ptr, long len)
 556 {
 557     return str_new(rb_obj_class(obj), ptr, len);
 558 }
 559
 560 #define STR_BUF_MIN_SIZE 128
 561
 562 VALUE
 563 rb_str_buf_new(long capa)
 564 {
 565     VALUE str = str_alloc(rb_cString);
 566
 567     if (capa < STR_BUF_MIN_SIZE) {
 568         capa = STR_BUF_MIN_SIZE;
 569     }
 570     FL_SET(str, STR_NOEMBED);
 571     RSTRING(str)->as.heap.aux.capa = capa;
 572     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
 573     RSTRING(str)->as.heap.ptr[0] = '\0';
 574
 575     return str;
 576 }
 577
 578 VALUE
 579 rb_str_buf_new2(const char *ptr)
 580 {
 581     VALUE str;
 582     long len = strlen(ptr);
 583
 584     str = rb_str_buf_new(len);
 585     rb_str_buf_cat(str, ptr, len);
 586
 587     return str;
 588 }
 589
 590 VALUE
 591 rb_str_tmp_new(long len)
 592 {
 593     return str_new(0, 0, len);
 594 }
 595
 596 void
 597 rb_str_free(VALUE str)
 598 {
 599     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
 600         xfree(RSTRING(str)->as.heap.ptr);
 601     }
 602 }
 603
 604 VALUE
 605 rb_str_to_str(VALUE str)
 606 {
 607     return rb_convert_type(str, T_STRING, "String", "to_str");
 608 }
 609
 610 void
 611 rb_str_shared_replace(VALUE str, VALUE str2)
 612 {
 613     rb_encoding *enc;
 614     int cr;
 615     if (str == str2) return;
 616     enc = STR_ENC_GET(str2);
 617     cr = ENC_CODERANGE(str2);
 618     rb_str_modify(str);
 619     if (OBJ_TAINTED(str2)) OBJ_TAINT(str);
 620     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
 621         STR_SET_EMBED(str);
 622         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
 623         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
 624         rb_enc_associate(str, enc);
 625         ENC_CODERANGE_SET(str, cr);
 626         return;
 627     }
 628     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
 629         xfree(RSTRING_PTR(str));
 630     }
 631     STR_SET_NOEMBED(str);
 632     STR_UNSET_NOCAPA(str);
 633     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
 634     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
 635     if (STR_NOCAPA_P(str2)) {
 636         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
 637         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
 638     }
 639     else {
 640         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
 641     }
 642     RSTRING(str2)->as.heap.ptr = 0;     /* abandon str2 */
 643     RSTRING(str2)->as.heap.len = 0;
 644     RSTRING(str2)->as.heap.aux.capa = 0;
 645     STR_UNSET_NOCAPA(str2);
 646     rb_enc_associate(str, enc);
 647     ENC_CODERANGE_SET(str, cr);
 648 }
 649
 650 static ID id_to_s;
 651
 652 VALUE
 653 rb_obj_as_string(VALUE obj)
 654 {
 655     VALUE str;
 656
 657     if (TYPE(obj) == T_STRING) {
 658         return obj;
 659     }
 660     str = rb_funcall(obj, id_to_s, 0);
 661     if (TYPE(str) != T_STRING)
 662         return rb_any_to_s(obj);
 663     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
 664     return str;
 665 }
 666
 667 static VALUE rb_str_replace(VALUE, VALUE);
 668
 669 VALUE
 670 rb_str_dup(VALUE str)
 671 {
 672     VALUE dup = str_alloc(rb_obj_class(str));
 673     rb_str_replace(dup, str);
 674     return dup;
 675 }
 676
 677
 678 /*
 679  *  call-seq:
 680  *     String.new(str="")   => new_str
 681  *
 682  *  Returns a new string object containing a copy of <i>str</i>.
 683  */
 684
 685 static VALUE
 686 rb_str_init(int argc, VALUE *argv, VALUE str)
 687 {
 688     VALUE orig;
 689
 690     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
 691         rb_str_replace(str, orig);
 692     return str;
 693 }
 694
/*
 * Returns the number of characters in the byte range [p, e) for encoding
 * enc.
 *
 *  - Fixed-width encodings (mbmaxlen == mbminlen): the byte length divided
 *    by the character width, rounded up.
 *  - ASCII-compatible encodings: ASCII runs are skipped with
 *    search_nonascii() and counted one character per byte.
 *  - Others: characters are stepped over one at a time.
 *
 * Character lengths come from rb_enc_mbclen(); no validation is done here.
 */
long
rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
{
    long c;
    const char *q;

    if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        c = 0;
        while (p < e) {
            if (ISASCII(*p)) {
                q = search_nonascii(p, e);
                if (!q)
                    /* the rest is ASCII: one character per byte */
                    return c + (e - p);
                c += q - p;
                p = q;
            }
            /* p is at a non-ASCII byte here */
            p += rb_enc_mbclen(p, e, enc);
            c++;
        }
        return c;
    }

    for (c=0; p<e; c++) {
        p += rb_enc_mbclen(p, e, enc);
    }
    return c;
}
 725
/*
 * Like rb_enc_strlen(), but validates characters with
 * rb_enc_precise_mbclen() and reports the observed coderange through *cr.
 *
 *  - Fixed-width encodings: *cr is left at 0 and the length is computed
 *    arithmetically.
 *  - Otherwise *cr ends up as ENC_CODERANGE_7BIT, ENC_CODERANGE_VALID or
 *    ENC_CODERANGE_BROKEN.  A byte that does not start a valid character
 *    is counted as one character and skipped.
 *
 * NOTE(review): a later valid character ORs ENC_CODERANGE_VALID into a *cr
 * that may already be BROKEN; whether BROKEN survives depends on the
 * numeric values of these constants, which are not visible here.
 */
long
rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
{
    long c;
    const char *q;
    int ret;

    *cr = 0;
    if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        c = 0;
        while (p < e) {
            if (ISASCII(*p)) {
                q = search_nonascii(p, e);
                if (!q) {
                    /* only ASCII remains */
                    if (!*cr) *cr = ENC_CODERANGE_7BIT;
                    return c + (e - p);
                }
                c += q - p;
                p = q;
            }
            ret = rb_enc_precise_mbclen(p, e, enc);
            if (MBCLEN_CHARFOUND_P(ret)) {
                *cr |= ENC_CODERANGE_VALID;
                p += MBCLEN_CHARFOUND_LEN(ret);
            }
            else {
                /* invalid or incomplete: count the byte and move on */
                *cr = ENC_CODERANGE_BROKEN;
                p++;
            }
            c++;
        }
        if (!*cr) *cr = ENC_CODERANGE_7BIT;
        return c;
    }

    for (c=0; p<e; c++) {
        ret = rb_enc_precise_mbclen(p, e, enc);
        if (MBCLEN_CHARFOUND_P(ret)) {
            *cr |= ENC_CODERANGE_VALID;
            p += MBCLEN_CHARFOUND_LEN(ret);
        }
        else {
            *cr = ENC_CODERANGE_BROKEN;
            p++;
        }
    }
    if (!*cr) *cr = ENC_CODERANGE_7BIT;
    return c;
}
 778
#ifdef NONASCII_MASK
/* True for every byte that is not of the form 10xxxxxx, i.e. every byte
 * that is not a UTF-8 continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
/*
 * Counts the bytes in the VALUE word at s that satisfy is_utf8_lead_byte(),
 * handling all bytes of the word in parallel.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE d = *s;
    /* per byte: bit 6 becomes (bit6 | !bit7), which is 1 for a lead byte */
    d |= ~(d>>1);
    /* move that bit down to bit 0 of each byte and drop everything else */
    d >>= 6;
    d &= NONASCII_MASK >> 7;
    /* horizontal sum of the per-byte 0/1 values into the lowest byte */
    d += (d>>8);
    d += (d>>16);
#if SIZEOF_VALUE == 8
    d += (d>>32);
#endif
    /* at most sizeof(VALUE) bytes can match, so four bits hold the count */
    return (d&0xF);
}
#endif
 796
 797 static long
 798 str_strlen(VALUE str, rb_encoding *enc)
 799 {
 800     const char *p, *e;
 801     int n, cr;
 802
 803     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
 804     if (!enc) enc = STR_ENC_GET(str);
 805     p = RSTRING_PTR(str);
 806     e = RSTRING_END(str);
 807 #ifdef NONASCII_MASK
 808     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
 809         enc == rb_utf8_encoding()) {
 810         VALUE len = 0;
 811         if (sizeof(VALUE) * 2 < e - p) {
 812             const VALUE *s, *t;
 813             const VALUE lowbits = sizeof(VALUE) - 1;
 814             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
 815             t = (const VALUE*)(~lowbits & (VALUE)e);
 816             while (p < (const char *)s) {
 817                 if (is_utf8_lead_byte(*p)) len++;
 818                 p++;
 819             }
 820             while (s < t) {
 821                 len += count_utf8_lead_bytes_with_word(s);
 822                 s++;
 823             }
 824             p = (const char *)s;
 825         }
 826         while (p < e) {
 827             if (is_utf8_lead_byte(*p)) len++;
 828             p++;
 829         }
 830         return (long)len;
 831     }
 832 #endif
 833     n = rb_enc_strlen_cr(p, e, enc, &cr);
 834     if (cr) {
 835         ENC_CODERANGE_SET(str, cr);
 836     }
 837     return n;
 838 }
 839
 840 /*
 841  *  call-seq:
 842  *     str.length   => integer
 843  *     str.size     => integer
 844  *
 845  *  Returns the character length of <i>str</i>.
 846  */
 847
 848 VALUE
 849 rb_str_length(VALUE str)
 850 {
 851     int len;
 852
 853     len = str_strlen(str, STR_ENC_GET(str));
 854     return INT2NUM(len);
 855 }
 856
 857 /*
 858  *  call-seq:
 859  *     str.bytesize  => integer
 860  *
 861  *  Returns the length of <i>str</i> in bytes.
 862  */
 863
 864 static VALUE
 865 rb_str_bytesize(VALUE str)
 866 {
 867     return INT2NUM(RSTRING_LEN(str));
 868 }
 869
 870 /*
 871  *  call-seq:
 872  *     str.empty?   => true or false
 873  *
 874  *  Returns <code>true</code> if <i>str</i> has a length of zero.
 875  *
 876  *     "hello".empty?   #=> false
 877  *     "".empty?        #=> true
 878  */
 879
 880 static VALUE
 881 rb_str_empty(VALUE str)
 882 {
 883     if (RSTRING_LEN(str) == 0)
 884         return Qtrue;
 885     return Qfalse;
 886 }
 887
 888 /*
 889  *  call-seq:
 890  *     str + other_str   => new_str
 891  *
 892  *  Concatenation---Returns a new <code>String</code> containing
 893  *  <i>other_str</i> concatenated to <i>str</i>.
 894  *
 895  *     "Hello from " + self.to_s   #=> "Hello from main"
 896  */
 897
 898 VALUE
 899 rb_str_plus(VALUE str1, VALUE str2)
 900 {
 901     VALUE str3;
 902     rb_encoding *enc;
 903
 904     StringValue(str2);
 905     enc = rb_enc_check(str1, str2);
 906     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
 907     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
 908     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
 909            RSTRING_PTR(str2), RSTRING_LEN(str2));
 910     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
 911
 912     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
 913         OBJ_TAINT(str3);
 914     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
 915                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
 916     return str3;
 917 }
 918
 919 /*
 920  *  call-seq:
 921  *     str * integer   => new_str
 922  *
 923  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
 924  *  the receiver.
 925  *
 926  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
 927  */
 928
 929 VALUE
 930 rb_str_times(VALUE str, VALUE times)
 931 {
 932     VALUE str2;
 933     long n, len;
 934
 935     len = NUM2LONG(times);
 936     if (len < 0) {
 937         rb_raise(rb_eArgError, "negative argument");
 938     }
 939     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
 940         rb_raise(rb_eArgError, "argument too big");
 941     }
 942
 943     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
 944     if (len) {
 945         n = RSTRING_LEN(str);
 946         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), n);
 947         while (n <= len/2) {
 948             memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), n);
 949             n *= 2;
 950         }
 951         memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), len-n);
 952     }
 953     RSTRING_PTR(str2)[RSTRING_LEN(str2)] = '\0';
 954     OBJ_INFECT(str2, str);
 955     rb_enc_cr_str_copy_for_substr(str2, str);
 956
 957     return str2;
 958 }
 959
 960 /*
 961  *  call-seq:
 962  *     str % arg   => new_str
 963  *
 964  *  Format---Uses <i>str</i> as a format specification, and returns the result
 965  *  of applying it to <i>arg</i>. If the format specification contains more than
 966  *  one substitution, then <i>arg</i> must be an <code>Array</code> containing
 967  *  the values to be substituted. See <code>Kernel::sprintf</code> for details
 968  *  of the format string.
 969  *
 970  *     "%05d" % 123                              #=> "00123"
 971  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
 972  */
 973
 974 static VALUE
 975 rb_str_format_m(VALUE str, VALUE arg)
 976 {
 977     volatile VALUE tmp = rb_check_array_type(arg);
 978
 979     if (!NIL_P(tmp)) {
 980         return rb_str_format(RARRAY_LEN(tmp), RARRAY_PTR(tmp), str);
 981     }
 982     return rb_str_format(1, &arg, str);
 983 }
 984
 985 static inline void
 986 str_modifiable(VALUE str)
 987 {
 988     if (FL_TEST(str, STR_TMPLOCK)) {
 989         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
 990     }
 991     if (OBJ_FROZEN(str)) rb_error_frozen("string");
 992     if (!OBJ_TAINTED(str) && rb_safe_level() >= 4)
 993         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
 994 }
 995
 996 static inline int
 997 str_independent(VALUE str)
 998 {
 999     str_modifiable(str);
1000     if (!STR_SHARED_P(str)) return 1;
1001     if (STR_EMBED_P(str)) return 1;
1002     return 0;
1003 }
1004
1005 static void
1006 str_make_independent(VALUE str)
1007 {
1008     char *ptr;
1009     long len = RSTRING_LEN(str);
1010
1011     ptr = ALLOC_N(char, len+1);
1012     if (RSTRING_PTR(str)) {
1013         memcpy(ptr, RSTRING_PTR(str), len);
1014     }
1015     STR_SET_NOEMBED(str);
1016     ptr[len] = 0;
1017     RSTRING(str)->as.heap.ptr = ptr;
1018     RSTRING(str)->as.heap.len = len;
1019     RSTRING(str)->as.heap.aux.capa = len;
1020     STR_UNSET_NOCAPA(str);
1021 }
1022
1023 void
1024 rb_str_modify(VALUE str)
1025 {
1026     if (!str_independent(str))
1027         str_make_independent(str);
1028     ENC_CODERANGE_CLEAR(str);
1029 }
1030
/*
 * Attaches the object `add` to str, storing it in as.heap.aux.shared and
 * flagging the string STR_ASSOC.
 *
 *  - str already has an associated object: `add` is concatenated to it with
 *    rb_ary_concat().
 *  - Otherwise str is first brought into a private heap buffer whose
 *    capacity equals its length (the aux field is about to stop holding a
 *    capacity), then `add` is installed with its class pointer cleared.
 *    If str was sharing the buffer of a string that itself had an
 *    associated object, `add` is concatenated to that object and the
 *    combined object is installed instead.
 *
 * Raises (rb_error_frozen) when str is frozen.
 */
void
rb_str_associate(VALUE str, VALUE add)
{
    /* sanity check */
    if (OBJ_FROZEN(str)) rb_error_frozen("string");
    if (STR_ASSOC_P(str)) {
        /* already associated */
        rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
    }
    else {
        if (STR_SHARED_P(str)) {
            /* remember the shared string before the link is dropped */
            VALUE assoc = RSTRING(str)->as.heap.aux.shared;
            str_make_independent(str);
            if (STR_ASSOC_P(assoc)) {
                assoc = RSTRING(assoc)->as.heap.aux.shared;
                rb_ary_concat(assoc, add);
                add = assoc;
            }
        }
        else if (STR_EMBED_P(str)) {
            str_make_independent(str);
        }
        else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
            /* shrink so that capacity == length */
            RESIZE_CAPA(str, RSTRING_LEN(str));
        }
        FL_SET(str, STR_ASSOC);
        RBASIC(add)->klass = 0;
        RSTRING(str)->as.heap.aux.shared = add;
    }
}
1061
1062 VALUE
1063 rb_str_associated(VALUE str)
1064 {
1065     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1066     if (STR_ASSOC_P(str)) {
1067         return RSTRING(str)->as.heap.aux.shared;
1068     }
1069     return Qfalse;
1070 }
1071
1072 VALUE
1073 rb_string_value(volatile VALUE *ptr)
1074 {
1075     VALUE s = *ptr;
1076     if (TYPE(s) != T_STRING) {
1077         s = rb_str_to_str(s);
1078         *ptr = s;
1079     }
1080     return s;
1081 }
1082
1083 char *
1084 rb_string_value_ptr(volatile VALUE *ptr)
1085 {
1086     return RSTRING_PTR(rb_string_value(ptr));
1087 }
1088
1089 char *
1090 rb_string_value_cstr(volatile VALUE *ptr)
1091 {
1092     VALUE str = rb_string_value(ptr);
1093     char *s = RSTRING_PTR(str);
1094
1095     if (!s || RSTRING_LEN(str) != strlen(s)) {
1096         rb_raise(rb_eArgError, "string contains null byte");
1097     }
1098     return s;
1099 }
1100
1101 VALUE
1102 rb_check_string_type(VALUE str)
1103 {
1104     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1105     return str;
1106 }
1107
1108 /*
1109  *  call-seq:
1110  *     String.try_convert(obj) -> string or nil
1111  *
1112  *  Try to convert <i>obj</i> into a String, using to_str method.
1113  *  Returns converted regexp or nil if <i>obj</i> cannot be converted
1114  *  for any reason.
1115  *
1116  *     String.try_convert("str")     # => str
1117  *     String.try_convert(/re/)      # => nil
1118  */
1119 static VALUE
1120 rb_str_s_try_convert(VALUE dummy, VALUE str)
1121 {
1122     return rb_check_string_type(str);
1123 }
1124
1125 char*
1126 rb_enc_nth(const char *p, const char *e, int nth, rb_encoding *enc)
1127 {
1128     if (rb_enc_mbmaxlen(enc) == 1) {
1129         p += nth;
1130     }
1131     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1132         p += nth * rb_enc_mbmaxlen(enc);
1133     }
1134     else if (rb_enc_asciicompat(enc)) {
1135         const char *p2, *e2;
1136         int n;
1137
1138         while (p < e && 0 < nth) {
1139             e2 = p + nth;
1140             if (e < e2)
1141                 return (char *)e;
1142             if (ISASCII(*p)) {
1143                 p2 = search_nonascii(p, e2);
1144                 if (!p2)
1145                     return (char *)e2;
1146                 nth -= p2 - p;
1147                 p = p2;
1148             }
1149             n = rb_enc_mbclen(p, e, enc);
1150             p += n;
1151             nth--;
1152         }
1153         if (nth != 0)
1154             return (char *)e;
1155         return (char *)p;
1156     }
1157     else {
1158         while (p<e && nth--) {
1159             p += rb_enc_mbclen(p, e, enc);
1160         }
1161     }
1162     if (p > e) p = e;
1163     return (char*)p;
1164 }
1165
1166 static char*
1167 str_nth(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1168 {
1169     if (singlebyte)
1170         p += nth;
1171     else {
1172         p = rb_enc_nth(p, e, nth, enc);
1173     }
1174     if (!p) return 0;
1175     if (p > e) p = e;
1176     return (char *)p;
1177 }
1178
1179 /* char offset to byte offset */
1180 static int
1181 str_offset(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1182 {
1183     const char *pp = str_nth(p, e, nth, enc, singlebyte);
1184     if (!pp) return e - p;
1185     return pp - p;
1186 }
1187
1188 #ifdef NONASCII_MASK
1189 static char *
1190 str_utf8_nth(const char *p, const char *e, int nth)
1191 {
1192     if (sizeof(VALUE) * 2 < nth) {
1193         const VALUE *s, *t;
1194         const VALUE lowbits = sizeof(VALUE) - 1;
1195         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1196         t = (const VALUE*)(~lowbits & (VALUE)e);
1197         while (p < (const char *)s) {
1198             if (is_utf8_lead_byte(*p)) nth--;
1199             p++;
1200         }
1201         do {
1202             nth -= count_utf8_lead_bytes_with_word(s);
1203             s++;
1204         } while (s < t && sizeof(VALUE) <= nth);
1205         p = (char *)s;
1206     }
1207     while (p < e) {
1208         if (is_utf8_lead_byte(*p)) {
1209             if (nth == 0) break;
1210             nth--;
1211         }
1212         p++;
1213     }
1214     return (char *)p;
1215 }
1216
/*
 * UTF-8 variant of the char offset to byte offset conversion: returns the
 * distance in bytes from p to the position found by str_utf8_nth(), or
 * e - p when that position is null.
 */
static int
str_utf8_offset(const char *p, const char *e, int nth)
{
    const char *target = str_utf8_nth(p, e, nth);

    return (target ? target : e) - p;
}
1224 #endif
1225
1226 /* byte offset to char offset */
1227 long
1228 rb_str_sublen(VALUE str, long pos)
1229 {
1230     if (single_byte_optimizable(str) || pos < 0)
1231         return pos;
1232     else {
1233         char *p = RSTRING_PTR(str);
1234         return rb_enc_strlen(p, p + pos, STR_ENC_GET(str));
1235     }
1236 }
1237
1238 VALUE
1239 rb_str_subseq(VALUE str, long beg, long len)
1240 {
1241     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1242
1243     rb_enc_cr_str_copy_for_substr(str2, str);
1244     OBJ_INFECT(str2, str);
1245
1246     return str2;
1247 }
1248
1249 VALUE
1250 rb_str_substr(VALUE str, long beg, long len)
1251 {
1252     rb_encoding *enc = STR_ENC_GET(str);
1253     VALUE str2;
1254     char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
1255     int singlebyte;
1256
1257     if (len < 0) return Qnil;
1258     if (!RSTRING_LEN(str)) {
1259         len = 0;
1260     }
1261     if (beg < 0) {
1262         if (len > -beg) len = -beg;
1263         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
1264             beg = -beg;
1265             while (beg-- > len && (e = rb_enc_prev_char(s, e, enc)) != 0);
1266             p = e;
1267             if (!p) return Qnil;
1268             while (len-- > 0 && (p = rb_enc_prev_char(s, p, enc)) != 0);
1269             if (!p) return Qnil;
1270             len = e - p;
1271             goto sub;
1272         }
1273         else {
1274             beg += str_strlen(str, enc);
1275             if (beg < 0) return Qnil;
1276         }
1277     }
1278     else if (beg > 0 && beg > str_strlen(str, enc)) {
1279         return Qnil;
1280     }
1281     singlebyte = single_byte_optimizable(str);
1282     if (len == 0) {
1283         p = 0;
1284     }
1285 #ifdef NONASCII_MASK
1286     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
1287         enc == rb_utf8_encoding()) {
1288         p = str_utf8_nth(s, e, beg);
1289         len = str_utf8_offset(p, e, len);
1290     }
1291 #endif
1292     else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) {
1293         len = 0;
1294     }
1295     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1296         if (len * rb_enc_mbmaxlen(enc) > e - p)
1297             len = e - p;
1298         else
1299             len *= rb_enc_mbmaxlen(enc);
1300     }
1301     else {
1302         len = str_offset(p, e, len, enc, singlebyte);
1303     }
1304   sub:
1305     if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
1306         str2 = rb_str_new4(str);
1307         str2 = str_new3(rb_obj_class(str2), str2);
1308         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
1309         RSTRING(str2)->as.heap.len = len;
1310     }
1311     else {
1312         str2 = rb_str_new5(str, p, len);
1313         rb_enc_cr_str_copy_for_substr(str2, str);
1314         OBJ_INFECT(str2, str);
1315     }
1316
1317     return str2;
1318 }
1319
1320 VALUE
1321 rb_str_freeze(VALUE str)
1322 {
1323     if (STR_ASSOC_P(str)) {
1324         VALUE ary = RSTRING(str)->as.heap.aux.shared;
1325         OBJ_FREEZE(ary);
1326     }
1327     return rb_obj_freeze(str);
1328 }
1329
1330 VALUE
1331 rb_str_dup_frozen(VALUE str)
1332 {
1333     if (STR_SHARED_P(str) && RSTRING(str)->as.heap.aux.shared) {
1334         VALUE shared = RSTRING(str)->as.heap.aux.shared;
1335         if (RSTRING_LEN(shared) == RSTRING_LEN(str)) {
1336             OBJ_FREEZE(shared);
1337             return shared;
1338         }
1339     }
1340     if (OBJ_FROZEN(str)) return str;
1341     str = rb_str_dup(str);
1342     OBJ_FREEZE(str);
1343     return str;
1344 }
1345
1346 VALUE
1347 rb_str_locktmp(VALUE str)
1348 {
1349     if (FL_TEST(str, STR_TMPLOCK)) {
1350         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1351     }
1352     FL_SET(str, STR_TMPLOCK);
1353     return str;
1354 }
1355
1356 VALUE
1357 rb_str_unlocktmp(VALUE str)
1358 {
1359     if (!FL_TEST(str, STR_TMPLOCK)) {
1360         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1361     }
1362     FL_UNSET(str, STR_TMPLOCK);
1363     return str;
1364 }
1365
1366 void
1367 rb_str_set_len(VALUE str, long len)
1368 {
1369     STR_SET_LEN(str, len);
1370     RSTRING_PTR(str)[len] = '\0';
1371 }
1372
1373 VALUE
1374 rb_str_resize(VALUE str, long len)
1375 {
1376     long slen;
1377
1378     if (len < 0) {
1379         rb_raise(rb_eArgError, "negative string size (or size too big)");
1380     }
1381
1382     rb_str_modify(str);
1383     slen = RSTRING_LEN(str);
1384     if (len != slen) {
1385         if (STR_EMBED_P(str)) {
1386             char *ptr;
1387             if (len <= RSTRING_EMBED_LEN_MAX) {
1388                 STR_SET_EMBED_LEN(str, len);
1389                 RSTRING(str)->as.ary[len] = '\0';
1390                 return str;
1391             }
1392             ptr = ALLOC_N(char,len+1);
1393             MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
1394             RSTRING(str)->as.heap.ptr = ptr;
1395             STR_SET_NOEMBED(str);
1396         }
1397         else if (len <= RSTRING_EMBED_LEN_MAX) {
1398             char *ptr = RSTRING(str)->as.heap.ptr;
1399             STR_SET_EMBED(str);
1400             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, len);
1401             RSTRING(str)->as.ary[len] = '\0';
1402             STR_SET_EMBED_LEN(str, len);
1403             xfree(ptr);
1404             return str;
1405         }
1406         else if (slen < len || slen - len > 1024) {
1407             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1408         }
1409         if (!STR_NOCAPA_P(str)) {
1410             RSTRING(str)->as.heap.aux.capa = len;
1411         }
1412         RSTRING(str)->as.heap.len = len;
1413         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
1414     }
1415     return str;
1416 }
1417
1418 static VALUE
1419 str_buf_cat(VALUE str, const char *ptr, long len)
1420 {
1421     long capa, total, off = -1;
1422
1423     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
1424         off = ptr - RSTRING_PTR(str);
1425     }
1426     rb_str_modify(str);
1427     if (len == 0) return 0;
1428     if (STR_ASSOC_P(str)) {
1429         FL_UNSET(str, STR_ASSOC);
1430         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1431     }
1432     else if (STR_EMBED_P(str)) {
1433         capa = RSTRING_EMBED_LEN_MAX;
1434     }
1435     else {
1436         capa = RSTRING(str)->as.heap.aux.capa;
1437     }
1438     if (RSTRING_LEN(str) >= LONG_MAX - len) {
1439         rb_raise(rb_eArgError, "string sizes too big");
1440     }
1441     total = RSTRING_LEN(str)+len;
1442     if (capa <= total) {
1443         while (total > capa) {
1444             if (capa + 1 >= LONG_MAX / 2) {
1445                 capa = (total + 4095) / 4096;
1446                 break;
1447             }
1448             capa = (capa + 1) * 2;
1449         }
1450         RESIZE_CAPA(str, capa);
1451     }
1452     if (off != -1) {
1453         ptr = RSTRING_PTR(str) + off;
1454     }
1455     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1456     STR_SET_LEN(str, total);
1457     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1458
1459     return str;
1460 }
1461
1462 VALUE
1463 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1464 {
1465     if (len == 0) return str;
1466     if (len < 0) {
1467         rb_raise(rb_eArgError, "negative string size (or size too big)");
1468     }
1469     return str_buf_cat(str, ptr, len);
1470 }
1471
1472 VALUE
1473 rb_str_buf_cat2(VALUE str, const char *ptr)
1474 {
1475     return rb_str_buf_cat(str, ptr, strlen(ptr));
1476 }
1477
1478 VALUE
1479 rb_str_cat(VALUE str, const char *ptr, long len)
1480 {
1481     if (len < 0) {
1482         rb_raise(rb_eArgError, "negative string size (or size too big)");
1483     }
1484     if (STR_ASSOC_P(str)) {
1485         rb_str_modify(str);
1486         if (STR_EMBED_P(str)) str_make_independent(str);
1487         REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
1488         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
1489         RSTRING(str)->as.heap.len += len;
1490         RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0'; /* sentinel */
1491         return str;
1492     }
1493
1494     return rb_str_buf_cat(str, ptr, len);
1495 }
1496
1497 VALUE
1498 rb_str_cat2(VALUE str, const char *ptr)
1499 {
1500     return rb_str_cat(str, ptr, strlen(ptr));
1501 }
1502
1503 static VALUE
1504 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
1505     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
1506 {
1507     int str_encindex = ENCODING_GET(str);
1508     int res_encindex;
1509     int str_cr, res_cr;
1510     int str_a8 = ENCODING_IS_ASCII8BIT(str);
1511     int ptr_a8 = ptr_encindex == 0;
1512
1513     str_cr = ENC_CODERANGE(str);
1514
1515     if (str_encindex == ptr_encindex) {
1516         if (str_cr == ENC_CODERANGE_UNKNOWN ||
1517             (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
1518             ptr_cr = ENC_CODERANGE_UNKNOWN;
1519         }
1520         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1521             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
1522         }
1523     }
1524     else {
1525         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
1526         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
1527         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
1528             if (len == 0)
1529                 return str;
1530             if (RSTRING_LEN(str) == 0) {
1531                 rb_str_buf_cat(str, ptr, len);
1532                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
1533                 return str;
1534             }
1535             goto incompatible;
1536         }
1537         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
1538             ptr_cr = coderange_scan(ptr, len, ptr_enc);
1539         }
1540         if (str_cr == ENC_CODERANGE_UNKNOWN) {
1541             if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
1542                 str_cr = rb_enc_str_coderange(str);
1543             }
1544         }
1545     }
1546     if (ptr_cr_ret)
1547         *ptr_cr_ret = ptr_cr;
1548
1549     if (str_encindex != ptr_encindex &&
1550         str_cr != ENC_CODERANGE_7BIT &&
1551         ptr_cr != ENC_CODERANGE_7BIT) {
1552       incompatible:
1553         rb_raise(rb_eArgError, "append incompatible encoding strings: %s and %s",
1554             rb_enc_name(rb_enc_from_index(str_encindex)),
1555             rb_enc_name(rb_enc_from_index(ptr_encindex)));
1556     }
1557
1558     if (str_cr == ENC_CODERANGE_UNKNOWN) {
1559         res_encindex = str_encindex;
1560         res_cr = ENC_CODERANGE_UNKNOWN;
1561     }
1562     else if (str_cr == ENC_CODERANGE_7BIT) {
1563         if (ptr_cr == ENC_CODERANGE_7BIT) {
1564             res_encindex = !str_a8 ? str_encindex : ptr_encindex;
1565             res_cr = ENC_CODERANGE_7BIT;
1566         }
1567         else {
1568             res_encindex = ptr_encindex;
1569             res_cr = ptr_cr;
1570         }
1571     }
1572     else if (str_cr == ENC_CODERANGE_VALID) {
1573         res_encindex = str_encindex;
1574         res_cr = str_cr;
1575     }
1576     else { /* str_cr == ENC_CODERANGE_BROKEN */
1577         res_encindex = str_encindex;
1578         res_cr = str_cr;
1579         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
1580     }
1581
1582     if (len < 0) {
1583         rb_raise(rb_eArgError, "negative string size (or size too big)");
1584     }
1585     str_buf_cat(str, ptr, len);
1586     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
1587     return str;
1588 }
1589
1590 VALUE
1591 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
1592 {
1593     return rb_enc_cr_str_buf_cat(str, ptr, len,
1594         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
1595 }
1596
1597 VALUE
1598 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
1599 {
1600     /* ptr must reference NUL terminated ASCII string. */
1601     int encindex = ENCODING_GET(str);
1602     rb_encoding *enc = rb_enc_from_index(encindex);
1603     if (rb_enc_asciicompat(enc)) {
1604         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
1605             encindex, ENC_CODERANGE_7BIT, 0);
1606     }
1607     else {
1608         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
1609         while (*ptr) {
1610             int c = (unsigned char)*ptr;
1611             int len = rb_enc_codelen(c, enc);
1612             rb_enc_mbcput(c, buf, enc);
1613             rb_enc_cr_str_buf_cat(str, buf, len,
1614                 encindex, ENC_CODERANGE_VALID, 0);
1615             ptr++;
1616         }
1617         return str;
1618     }
1619 }
1620
1621 VALUE
1622 rb_str_buf_append(VALUE str, VALUE str2)
1623 {
1624     int str2_cr;
1625
1626     str2_cr = ENC_CODERANGE(str2);
1627
1628     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
1629         ENCODING_GET(str2), str2_cr, &str2_cr);
1630
1631     OBJ_INFECT(str, str2);
1632     ENC_CODERANGE_SET(str2, str2_cr);
1633
1634     return str;
1635 }
1636
1637 VALUE
1638 rb_str_append(VALUE str, VALUE str2)
1639 {
1640     rb_encoding *enc;
1641     int cr, cr2;
1642
1643     StringValue(str2);
1644     if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
1645         long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
1646         enc = rb_enc_check(str, str2);
1647         cr = ENC_CODERANGE(str);
1648         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
1649         rb_str_modify(str);
1650         REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1651         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
1652                RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
1653         RSTRING(str)->as.heap.len = len;
1654         rb_enc_associate(str, enc);
1655         ENC_CODERANGE_SET(str, cr);
1656         OBJ_INFECT(str, str2);
1657         return str;
1658     }
1659     return rb_str_buf_append(str, str2);
1660 }
1661
1662
1663 /*
1664  *  call-seq:
1665  *     str << fixnum        => str
1666  *     str.concat(fixnum)   => str
1667  *     str << obj           => str
1668  *     str.concat(obj)      => str
1669  *
1670  *  Append---Concatenates the given object to <i>str</i>. If the object is a
1671  *  <code>Fixnum</code>, it is considered as a codepoint, and is converted
1672  *  to a character before concatenation.
1673  *
1674  *     a = "hello "
1675  *     a << "world"   #=> "hello world"
1676  *     a.concat(33)   #=> "hello world!"
1677  */
1678
1679 VALUE
1680 rb_str_concat(VALUE str1, VALUE str2)
1681 {
1682     if (FIXNUM_P(str2)) {
1683         rb_encoding *enc = STR_ENC_GET(str1);
1684         int c = FIX2INT(str2);
1685         int pos = RSTRING_LEN(str1);
1686         int len = rb_enc_codelen(c, enc);
1687         int cr = ENC_CODERANGE(str1);
1688
1689         rb_str_resize(str1, pos+len);
1690         rb_enc_mbcput(c, RSTRING_PTR(str1)+pos, enc);
1691         ENC_CODERANGE_SET(str1, cr);
1692         return str1;
1693     }
1694     return rb_str_append(str1, str2);
1695 }
1696
1697 #if defined __i386__ || defined _M_IX86
1698 #define UNALIGNED_WORD_ACCESS 1
1699 #endif
1700 #ifndef UNALIGNED_WORD_ACCESS
1701 #define UNALIGNED_WORD_ACCESS 0
1702 #endif
1703
1704 /* MurmurHash described in http://murmurhash.googlepages.com/ */
1705 static unsigned int
1706 hash(const unsigned char * data, int len, unsigned int h)
1707 {
1708     const unsigned int m = 0x7fd652ad;
1709     const int r = 16;
1710
1711     h += 0xdeadbeef;
1712
1713     if (len >= 4) {
1714 #if !UNALIGNED_WORD_ACCESS
1715         int align = (VALUE)data & 3;
1716         if (align) {
1717             uint32_t t = 0, d = 0;
1718             int sl, sr, pack;
1719
1720             switch (align) {
1721 #ifdef WORDS_BIGENDIAN
1722               case 1: t |= data[2];
1723               case 2: t |= data[1] << 8;
1724               case 3: t |= data[0] << 16;
1725 #else
1726               case 1: t |= data[2] << 16;
1727               case 2: t |= data[1] << 8;
1728               case 3: t |= data[0];
1729 #endif
1730             }
1731
1732 #ifdef WORDS_BIGENDIAN
1733             t >>= (8 * align) - 8;
1734 #else
1735             t <<= (8 * align);
1736 #endif
1737
1738             data += 4-align;
1739             len -= 4-align;
1740
1741             sl = 8 * (4-align);
1742             sr = 8 * align;
1743
1744             while (len >= 4) {
1745                 d = *(uint32_t *)data;
1746 #ifdef WORDS_BIGENDIAN
1747                 t = (t << sr) | (d >> sl);
1748 #else
1749                 t = (t >> sr) | (d << sl);
1750 #endif
1751                 h += t;
1752                 h *= m;
1753                 h ^= h >> r;
1754                 t = d;
1755
1756                 data += 4;
1757                 len -= 4;
1758             }
1759
1760             pack = len < align ? len : align;
1761             d = 0;
1762             switch (pack) {
1763 #ifdef WORDS_BIGENDIAN
1764               case 3: d |= data[2] << 8;
1765               case 2: d |= data[1] << 16;
1766               case 1: d |= data[0] << 24;
1767               case 0:
1768                 h += (t << sr) | (d >> sl);
1769 #else
1770               case 3: d |= data[2] << 16;
1771               case 2: d |= data[1] << 8;
1772               case 1: d |= data[0];
1773               case 0:
1774                 h += (t >> sr) | (d << sl);
1775 #endif
1776                 h *= m;
1777                 h ^= h >> r;
1778             }
1779
1780             data += pack;
1781             len -= pack;
1782         }
1783         else
1784 #endif
1785         {
1786             do {
1787                 h += *(uint32_t *)data;
1788                 h *= m;
1789                 h ^= h >> r;
1790
1791                 data += 4;
1792                 len -= 4;
1793             } while (len >= 4);
1794         }
1795     }
1796
1797     switch(len) {
1798 #ifdef WORDS_BIGENDIAN
1799       case 3:
1800         h += data[2] << 8;
1801       case 2:
1802         h += data[1] << 16;
1803       case 1:
1804         h += data[0] << 24;
1805 #else
1806       case 3:
1807         h += data[2] << 16;
1808       case 2:
1809         h += data[1] << 8;
1810       case 1:
1811         h += data[0];
1812 #endif
1813         h *= m;
1814         h ^= h >> r;
1815     }
1816
1817     h *= m;
1818     h ^= h >> 10;
1819     h *= m;
1820     h ^= h >> 17;
1821
1822     return h;
1823 }
1824
/*
 * Returns the hash() value of the len bytes at ptr.  The seed is obtained
 * from rb_genrand_int32() on the first call and reused by later calls.
 * NOTE(review): len is a long here while hash() takes an int.
 */
int
rb_memhash(const void *ptr, long len)
{
    static unsigned int hashseed;
    static int hashseed_init = 0;
    const unsigned char *bytes = ptr;

    if (hashseed_init == 0) {
        hashseed = rb_genrand_int32();
        hashseed_init = 1;
    }

    return hash(bytes, len, hashseed);
}
1838
1839 int
1840 rb_str_hash(VALUE str)
1841 {
1842     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
1843 }
1844
1845 int
1846 rb_str_hash_cmp(VALUE str1, VALUE str2)
1847 {
1848     int len;
1849
1850     if (!rb_str_comparable(str1, str2)) return 1;
1851     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1852         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1853         return 0;
1854     }
1855     return 1;
1856 }
1857
1858 /*
1859  * call-seq:
1860  *    str.hash   => fixnum
1861  *
1862  * Return a hash based on the string's length and content.
1863  */
1864
1865 static VALUE
1866 rb_str_hash_m(VALUE str)
1867 {
1868     int hval = rb_str_hash(str);
1869     return INT2FIX(hval);
1870 }
1871
1872 #define lesser(a,b) (((a)>(b))?(b):(a))
1873
1874 int
1875 rb_str_comparable(VALUE str1, VALUE str2)
1876 {
1877     int idx1, idx2;
1878     int rc1, rc2;
1879
1880     if (RSTRING_LEN(str1) == 0) return Qtrue;
1881     if (RSTRING_LEN(str2) == 0) return Qtrue;
1882     idx1 = ENCODING_GET(str1);
1883     idx2 = ENCODING_GET(str2);
1884     if (idx1 == idx2) return Qtrue;
1885     rc1 = rb_enc_str_coderange(str1);
1886     rc2 = rb_enc_str_coderange(str2);
1887     if (rc1 == ENC_CODERANGE_7BIT) {
1888         if (rc2 == ENC_CODERANGE_7BIT) return Qtrue;
1889         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
1890             return Qtrue;
1891     }
1892     if (rc2 == ENC_CODERANGE_7BIT) {
1893         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
1894             return Qtrue;
1895     }
1896     return Qfalse;
1897 }
1898
1899 int
1900 rb_str_cmp(VALUE str1, VALUE str2)
1901 {
1902     long len;
1903     int retval;
1904
1905     len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
1906     retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
1907     if (retval == 0) {
1908         if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
1909             if (!rb_enc_compatible(str1, str2)) {
1910                 if (ENCODING_GET(str1) - ENCODING_GET(str2) > 0)
1911                     return 1;
1912                 return -1;
1913             }
1914             return 0;
1915         }
1916         if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
1917         return -1;
1918     }
1919     if (retval > 0) return 1;
1920     return -1;
1921 }
1922
1923
1924 /*
1925  *  call-seq:
1926  *     str == obj   => true or false
1927  *
1928  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
1929  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
1930  *  <code><=></code> <i>obj</i> returns zero.
1931  */
1932
1933 VALUE
1934 rb_str_equal(VALUE str1, VALUE str2)
1935 {
1936     int len;
1937
1938     if (str1 == str2) return Qtrue;
1939     if (TYPE(str2) != T_STRING) {
1940         if (!rb_respond_to(str2, rb_intern("to_str"))) {
1941             return Qfalse;
1942         }
1943         return rb_equal(str2, str1);
1944     }
1945     if (!rb_str_comparable(str1, str2)) return Qfalse;
1946     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1947         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1948         return Qtrue;
1949     }
1950     return Qfalse;
1951 }
1952
1953 /*
1954  * call-seq:
1955  *   str.eql?(other)   => true or false
1956  *
1957  * Two strings are equal if the have the same length and content.
1958  */
1959
1960 static VALUE
1961 rb_str_eql(VALUE str1, VALUE str2)
1962 {
1963     if (TYPE(str2) != T_STRING || RSTRING_LEN(str1) != RSTRING_LEN(str2))
1964         return Qfalse;
1965
1966     if (!rb_str_comparable(str1, str2)) return Qfalse;
1967     if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2),
1968                lesser(RSTRING_LEN(str1), RSTRING_LEN(str2))) == 0)
1969         return Qtrue;
1970
1971     return Qfalse;
1972 }
1973
1974 /*
1975  *  call-seq:
1976  *     str <=> other_str   => -1, 0, +1
1977  *
1978  *  Comparison---Returns -1 if <i>other_str</i> is less than, 0 if
1979  *  <i>other_str</i> is equal to, and +1 if <i>other_str</i> is greater than
1980  *  <i>str</i>. If the strings are of different lengths, and the strings are
1981  *  equal when compared up to the shortest length, then the longer string is
1982  *  considered greater than the shorter one. In older versions of Ruby, setting
1983  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
1984  *  in favor of using <code>String#casecmp</code>.
1985  *
1986  *  <code><=></code> is the basis for the methods <code><</code>,
1987  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
1988  *  included from module <code>Comparable</code>.  The method
1989  *  <code>String#==</code> does not use <code>Comparable#==</code>.
1990  *
1991  *     "abcdef" <=> "abcde"     #=> 1
1992  *     "abcdef" <=> "abcdef"    #=> 0
1993  *     "abcdef" <=> "abcdefg"   #=> -1
1994  *     "abcdef" <=> "ABCDEF"    #=> 1
1995  */
1996
1997 static VALUE
1998 rb_str_cmp_m(VALUE str1, VALUE str2)
1999 {
2000     long result;
2001
2002     if (TYPE(str2) != T_STRING) {
2003         if (!rb_respond_to(str2, rb_intern("to_str"))) {
2004             return Qnil;
2005         }
2006         else if (!rb_respond_to(str2, rb_intern("<=>"))) {
2007             return Qnil;
2008         }
2009         else {
2010             VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
2011
2012             if (NIL_P(tmp)) return Qnil;
2013             if (!FIXNUM_P(tmp)) {
2014                 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
2015             }
2016             result = -FIX2LONG(tmp);
2017         }
2018     }
2019     else {
2020         result = rb_str_cmp(str1, str2);
2021     }
2022     return LONG2NUM(result);
2023 }
2024
2025 /*
2026  *  call-seq:
2027  *     str.casecmp(other_str)   => -1, 0, +1
2028  *
2029  *  Case-insensitive version of <code>String#<=></code>.
2030  *
2031  *     "abcdef".casecmp("abcde")     #=> 1
2032  *     "aBcDeF".casecmp("abcdef")    #=> 0
2033  *     "abcdef".casecmp("abcdefg")   #=> -1
2034  *     "abcdef".casecmp("ABCDEF")    #=> 0
2035  */
2036
2037 static VALUE
2038 rb_str_casecmp(VALUE str1, VALUE str2)
2039 {
2040     long len;
2041     rb_encoding *enc;
2042     char *p1, *p1end, *p2, *p2end;
2043
2044     StringValue(str2);
2045     enc = rb_enc_compatible(str1, str2);
2046     if (!enc) {
2047         return Qnil;
2048     }
2049
2050     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2051     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2052     while (p1 < p1end && p2 < p2end) {
2053         int c1 = rb_enc_codepoint(p1, p1end, enc);
2054         int c2 = rb_enc_codepoint(p2, p2end, enc);
2055
2056         if (c1 != c2) {
2057             c1 = rb_enc_toupper(c1, enc);
2058             c2 = rb_enc_toupper(c2, enc);
2059             if (c1 > c2) return INT2FIX(1);
2060             if (c1 < c2) return INT2FIX(-1);
2061         }
2062         len = rb_enc_codelen(c1, enc);
2063         p1 += len;
2064         p2 += len;
2065     }
2066     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2067     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2068     return INT2FIX(-1);
2069 }
2070
2071 static long
2072 rb_str_index(VALUE str, VALUE sub, long offset)
2073 {
2074     long pos;
2075     char *s, *sptr;
2076     long len, slen;
2077     rb_encoding *enc;
2078
2079     enc = rb_enc_check(str, sub);
2080     if (is_broken_string(sub)) {
2081         return -1;
2082     }
2083     len = str_strlen(str, enc);
2084     slen = str_strlen(sub, enc);
2085     if (offset < 0) {
2086         offset += len;
2087         if (offset < 0) return -1;
2088     }
2089     if (len - offset < slen) return -1;
2090     s = RSTRING_PTR(str);
2091     if (offset) {
2092         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2093         s += offset;
2094     }
2095     if (slen == 0) return offset;
2096     /* need proceed one character at a time */
2097     sptr = RSTRING_PTR(sub);
2098     slen = RSTRING_LEN(sub);
2099     len = RSTRING_LEN(str) - offset;
2100     for (;;) {
2101         char *t;
2102         pos = rb_memsearch(sptr, slen, s, len, enc);
2103         if (pos < 0) return pos;
2104         t = rb_enc_right_char_head(s, s+pos, enc);
2105         if (t == s + pos) break;
2106         if ((len -= t - s) <= 0) return -1;
2107         offset += t - s;
2108         s = t;
2109     }
2110     return pos + offset;
2111 }
2112
2113
2114 /*
2115  *  call-seq:
2116  *     str.index(substring [, offset])   => fixnum or nil
2117  *     str.index(regexp [, offset])      => fixnum or nil
2118  *
2119  *  Returns the index of the first occurrence of the given <i>substring</i> or
2120  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2121  *  found. If the second parameter is present, it specifies the position in the
2122  *  string to begin the search.
2123  *
2124  *     "hello".index('e')             #=> 1
2125  *     "hello".index('lo')            #=> 3
2126  *     "hello".index('a')             #=> nil
2127  *     "hello".index(?e)              #=> 1
2128  *     "hello".index(/[aeiou]/, -3)   #=> 4
2129  */
2130
2131 static VALUE
2132 rb_str_index_m(int argc, VALUE *argv, VALUE str)
2133 {
2134     VALUE sub;
2135     VALUE initpos;
2136     long pos;
2137
2138     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2139         pos = NUM2LONG(initpos);
2140     }
2141     else {
2142         pos = 0;
2143     }
2144     if (pos < 0) {
2145         pos += str_strlen(str, STR_ENC_GET(str));
2146         if (pos < 0) {
2147             if (TYPE(sub) == T_REGEXP) {
2148                 rb_backref_set(Qnil);
2149             }
2150             return Qnil;
2151         }
2152     }
2153
2154     switch (TYPE(sub)) {
2155       case T_REGEXP:
2156         pos = rb_reg_adjust_startpos(sub, str, pos, 0);
2157         pos = rb_reg_search(sub, str, pos, 0);
2158         pos = rb_str_sublen(str, pos);
2159         break;
2160
2161       default: {
2162         VALUE tmp;
2163
2164         tmp = rb_check_string_type(sub);
2165         if (NIL_P(tmp)) {
2166             rb_raise(rb_eTypeError, "type mismatch: %s given",
2167                      rb_obj_classname(sub));
2168         }
2169         sub = tmp;
2170       }
2171         /* fall through */
2172       case T_STRING:
2173         pos = rb_str_index(str, sub, pos);
2174         pos = rb_str_sublen(str, pos);
2175         break;
2176     }
2177
2178     if (pos == -1) return Qnil;
2179     return LONG2NUM(pos);
2180 }
2181
2182 static long
2183 rb_str_rindex(VALUE str, VALUE sub, long pos)
2184 {
2185     long len, slen;
2186     char *s, *sbeg, *e, *t;
2187     rb_encoding *enc;
2188     int singlebyte = single_byte_optimizable(str);
2189
2190     enc = rb_enc_check(str, sub);
2191     if (is_broken_string(sub)) {
2192         return -1;
2193     }
2194     len = str_strlen(str, enc);
2195     slen = str_strlen(sub, enc);
2196     /* substring longer than string */
2197     if (len < slen) return -1;
2198     if (len - pos < slen) {
2199         pos = len - slen;
2200     }
2201     if (len == 0) {
2202         return pos;
2203     }
2204     sbeg = RSTRING_PTR(str);
2205     e = RSTRING_END(str);
2206     t = RSTRING_PTR(sub);
2207     slen = RSTRING_LEN(sub);
2208     for (;;) {
2209         s = str_nth(sbeg, e, pos, enc, singlebyte);
2210         if (!s) return -1;
2211         if (memcmp(s, t, slen) == 0) {
2212             return pos;
2213         }
2214         if (pos == 0) break;
2215         pos--;
2216     }
2217     return -1;
2218 }
2219
2220
2221 /*
2222  *  call-seq:
2223  *     str.rindex(substring [, fixnum])   => fixnum or nil
2224  *     str.rindex(regexp [, fixnum])   => fixnum or nil
2225  *
2226  *  Returns the index of the last occurrence of the given <i>substring</i> or
2227  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
2228  *  found. If the second parameter is present, it specifies the position in the
2229  *  string to end the search---characters beyond this point will not be
2230  *  considered.
2231  *
2232  *     "hello".rindex('e')             #=> 1
2233  *     "hello".rindex('l')             #=> 3
2234  *     "hello".rindex('a')             #=> nil
2235  *     "hello".rindex(?e)              #=> 1
2236  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
2237  */
2238
2239 static VALUE
2240 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
2241 {
2242     VALUE sub;
2243     VALUE vpos;
2244     rb_encoding *enc = STR_ENC_GET(str);
2245     long pos, len = str_strlen(str, enc);
2246
2247     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2248         pos = NUM2LONG(vpos);
2249         if (pos < 0) {
2250             pos += len;
2251             if (pos < 0) {
2252                 if (TYPE(sub) == T_REGEXP) {
2253                     rb_backref_set(Qnil);
2254                 }
2255                 return Qnil;
2256             }
2257         }
2258         if (pos > len) pos = len;
2259     }
2260     else {
2261         pos = len;
2262     }
2263
2264     switch (TYPE(sub)) {
2265       case T_REGEXP:
2266         /* enc = rb_get_check(str, sub); */
2267         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
2268             pos = rb_reg_adjust_startpos(sub, str, pos, 1);
2269             pos = rb_reg_search(sub, str, pos, 1);
2270             pos = rb_str_sublen(str, pos);
2271         }
2272         if (pos >= 0) return LONG2NUM(pos);
2273         break;
2274
2275       default: {
2276         VALUE tmp;
2277
2278         tmp = rb_check_string_type(sub);
2279         if (NIL_P(tmp)) {
2280             rb_raise(rb_eTypeError, "type mismatch: %s given",
2281                      rb_obj_classname(sub));
2282         }
2283         sub = tmp;
2284       }
2285         /* fall through */
2286       case T_STRING:
2287         pos = rb_str_rindex(str, sub, pos);
2288         if (pos >= 0) return LONG2NUM(pos);
2289         break;
2290     }
2291     return Qnil;
2292 }
2293
2294 /*
2295  *  call-seq:
2296  *     str =~ obj   => fixnum or nil
2297  *
2298  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
2299  *  against <i>str</i>, and return the position where the match starts, or
2300  *  <code>nil</code> if there is no match. Otherwise, invokes
2301  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2302  *  <code>=~</code> in <code>Object</code> returns <code>false</code>.
2303  *
2304  *     "cat o' 9 tails" =~ /\d/   #=> 7
2305  *     "cat o' 9 tails" =~ 9      #=> nil
2306  */
2307
2308 static VALUE
2309 rb_str_match(VALUE x, VALUE y)
2310 {
2311     switch (TYPE(y)) {
2312       case T_STRING:
2313         rb_raise(rb_eTypeError, "type mismatch: String given");
2314
2315       case T_REGEXP:
2316         return rb_reg_match(y, x);
2317
2318       default:
2319         return rb_funcall(y, rb_intern("=~"), 1, x);
2320     }
2321 }
2322
2323
2324 static VALUE get_pat(VALUE, int);
2325
2326
2327 /*
2328  *  call-seq:
2329  *     str.match(pattern [, pos])   => matchdata or nil
2330  *
2331  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2332  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
2333  *  parameter is present, it specifies the position in the string to begin the
2334  *  search.
2335  *
2336  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
2337  *     'hello'.match('(.)\1')[0]   #=> "ll"
2338  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
2339  *     'hello'.match('xx')         #=> nil
2340  *
2341  *  If a block is given, invokes the block with the MatchData if the match succeeds, so
2342  *  that you can write
2343  *
2344  *     str.match(pat) {|m| ...}
2345  *
2346  *  instead of
2347  *
2348  *     if m = str.match(pat)
2349  *       ...
2350  *     end
2351  *
2352  *  The return value is a value from block execution in this case.
2353  */
2354
2355 static VALUE
2356 rb_str_match_m(int argc, VALUE *argv, VALUE str)
2357 {
2358     VALUE re, result;
2359     if (argc < 1)
2360         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2361     re = argv[0];
2362     argv[0] = str;
2363     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2364     if (!NIL_P(result) && rb_block_given_p()) {
2365         return rb_yield(result);
2366     }
2367     return result;
2368 }
2369
/* Outcome of stepping a single character to its neighbour. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,      /* no usable neighbour for this character */
    NEIGHBOR_FOUND    = 1,      /* the neighbour was written in place */
    NEIGHBOR_WRAPPED  = 2       /* stepping ran off the end and wrapped around */
};
2375
2376 static enum neighbor_char
2377 enc_succ_char(char *p, int len, rb_encoding *enc)
2378 {
2379     int i, l;
2380     while (1) {
2381         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
2382             p[i] = '\0';
2383         if (i < 0)
2384             return NEIGHBOR_WRAPPED;
2385         ++((unsigned char*)p)[i];
2386         l = rb_enc_precise_mbclen(p, p+len, enc);
2387         if (MBCLEN_CHARFOUND_P(l)) {
2388             l = MBCLEN_CHARFOUND_LEN(l);
2389             if (l == len) {
2390                 return NEIGHBOR_FOUND;
2391             }
2392             else {
2393                 memset(p+l, 0xff, len-l);
2394             }
2395         }
2396         if (MBCLEN_INVALID_P(l) && i < len-1) {
2397             int len2, l2;
2398             for (len2 = len-1; 0 < len2; len2--) {
2399                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2400                 if (!MBCLEN_INVALID_P(l2))
2401                     break;
2402             }
2403             memset(p+len2+1, 0xff, len-(len2+1));
2404         }
2405     }
2406 }
2407
2408 static enum neighbor_char
2409 enc_pred_char(char *p, int len, rb_encoding *enc)
2410 {
2411     int i, l;
2412     while (1) {
2413         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
2414             p[i] = '\xff';
2415         if (i < 0)
2416             return NEIGHBOR_WRAPPED;
2417         --((unsigned char*)p)[i];
2418         l = rb_enc_precise_mbclen(p, p+len, enc);
2419         if (MBCLEN_CHARFOUND_P(l)) {
2420             l = MBCLEN_CHARFOUND_LEN(l);
2421             if (l == len) {
2422                 return NEIGHBOR_FOUND;
2423             }
2424             else {
2425                 memset(p+l, 0, len-l);
2426             }
2427         }
2428         if (MBCLEN_INVALID_P(l) && i < len-1) {
2429             int len2, l2;
2430             for (len2 = len-1; 0 < len2; len2--) {
2431                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
2432                 if (!MBCLEN_INVALID_P(l2))
2433                     break;
2434             }
2435             memset(p+len2+1, 0, len-(len2+1));
2436         }
2437     }
2438 }
2439
2440 /*
2441   overwrite +p+ by succeeding letter in +enc+ and returns
2442   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2443   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2444   assuming each ranges are successive, and mbclen
2445   never change in each ranges.
2446   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2447   character.
2448  */
2449 static enum neighbor_char
2450 enc_succ_alnum_char(char *p, int len, rb_encoding *enc, char *carry)
2451 {
2452     enum neighbor_char ret;
2453     int c;
2454     int ctype;
2455     int range;
2456     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2457
2458     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2459     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2460         ctype = ONIGENC_CTYPE_DIGIT;
2461     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2462         ctype = ONIGENC_CTYPE_ALPHA;
2463     else
2464         return NEIGHBOR_NOT_CHAR;
2465
2466     MEMCPY(save, p, char, len);
2467     ret = enc_succ_char(p, len, enc);
2468     if (ret == NEIGHBOR_FOUND) {
2469         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2470         if (rb_enc_isctype(c, ctype, enc))
2471             return NEIGHBOR_FOUND;
2472     }
2473     MEMCPY(p, save, char, len);
2474     range = 1;
2475     while (1) {
2476         MEMCPY(save, p, char, len);
2477         ret = enc_pred_char(p, len, enc);
2478         if (ret == NEIGHBOR_FOUND) {
2479             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2480             if (!rb_enc_isctype(c, ctype, enc)) {
2481                 MEMCPY(p, save, char, len);
2482                 break;
2483             }
2484         }
2485         else {
2486             MEMCPY(p, save, char, len);
2487             break;
2488         }
2489         range++;
2490     }
2491     if (range == 1) {
2492         return NEIGHBOR_NOT_CHAR;
2493     }
2494
2495     if (ctype != ONIGENC_CTYPE_DIGIT) {
2496         MEMCPY(carry, p, char, len);
2497         return NEIGHBOR_WRAPPED;
2498     }
2499
2500     MEMCPY(carry, p, char, len);
2501     enc_succ_char(carry, len, enc);
2502     return NEIGHBOR_WRAPPED;
2503 }
2504
2505
2506 /*
2507  *  call-seq:
2508  *     str.succ   => new_str
2509  *     str.next   => new_str
2510  *
2511  *  Returns the successor to <i>str</i>. The successor is calculated by
2512  *  incrementing characters starting from the rightmost alphanumeric (or
2513  *  the rightmost character if there are no alphanumerics) in the
2514  *  string. Incrementing a digit always results in another digit, and
2515  *  incrementing a letter results in another letter of the same case.
2516  *  Incrementing nonalphanumerics uses the underlying character set's
2517  *  collating sequence.
2518  *
2519  *  If the increment generates a ``carry,'' the character to the left of
2520  *  it is incremented. This process repeats until there is no carry,
2521  *  adding an additional character if necessary.
2522  *
2523  *     "abcd".succ        #=> "abce"
2524  *     "THX1138".succ     #=> "THX1139"
2525  *     "<<koala>>".succ   #=> "<<koalb>>"
2526  *     "1999zzz".succ     #=> "2000aaa"
2527  *     "ZZZ9999".succ     #=> "AAAA0000"
2528  *     "***".succ         #=> "**+"
2529  */
2530
2531 VALUE
2532 rb_str_succ(VALUE orig)
2533 {
2534     rb_encoding *enc;
2535     VALUE str;
2536     char *sbeg, *s, *e, *last_alnum = 0;
2537     int c = -1;
2538     long l;
2539     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
2540     int carry_pos = 0, carry_len = 1;
2541     enum neighbor_char neighbor = NEIGHBOR_FOUND;
2542
2543     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
2544     rb_enc_cr_str_copy_for_substr(str, orig);
2545     OBJ_INFECT(str, orig);
2546     if (RSTRING_LEN(str) == 0) return str;
2547
2548     enc = STR_ENC_GET(orig);
2549     sbeg = RSTRING_PTR(str);
2550     s = e = sbeg + RSTRING_LEN(str);
2551
2552     while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
2553         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
2554             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
2555                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
2556                 s = last_alnum;
2557                 break;
2558             }
2559         }
2560         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2561         neighbor = enc_succ_alnum_char(s, l, enc, carry);
2562         switch (neighbor) {
2563           case NEIGHBOR_NOT_CHAR:
2564             continue;
2565           case NEIGHBOR_FOUND:
2566             return str;
2567           case NEIGHBOR_WRAPPED:
2568             last_alnum = s;
2569             break;
2570         }
2571         c = 1;
2572         carry_pos = s - sbeg;
2573         carry_len = l;
2574     }
2575     if (c == -1) {              /* str contains no alnum */
2576         s = e;
2577         while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
2578             enum neighbor_char neighbor;
2579             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
2580             neighbor = enc_succ_char(s, l, enc);
2581             if (neighbor == NEIGHBOR_FOUND)
2582                 return str;
2583             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
2584                 /* wrapped to \0...\0.  search next valid char. */
2585                 enc_succ_char(s, l, enc);
2586             }
2587             if (!rb_enc_asciicompat(enc)) {
2588                 MEMCPY(carry, s, char, l);
2589                 carry_len = l;
2590             }
2591             carry_pos = s - sbeg;
2592         }
2593     }
2594     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
2595     s = RSTRING_PTR(str) + carry_pos;
2596     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
2597     memmove(s, carry, carry_len);
2598     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
2599     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2600     rb_enc_str_coderange(str);
2601     return str;
2602 }
2603
2604
2605 /*
2606  *  call-seq:
2607  *     str.succ!   => str
2608  *     str.next!   => str
2609  *
2610  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
2611  *  place.
2612  */
2613
2614 static VALUE
2615 rb_str_succ_bang(VALUE str)
2616 {
2617     rb_str_shared_replace(str, rb_str_succ(str));
2618
2619     return str;
2620 }
2621
2622
2623 /*
2624  *  call-seq:
2625  *     str.upto(other_str, exclusive=false) {|s| block }   => str
2626  *
2627  *  Iterates through successive values, starting at <i>str</i> and
2628  *  ending at <i>other_str</i> inclusive, passing each value in turn to
2629  *  the block. The <code>String#succ</code> method is used to generate
2630  *  each value.  If optional second argument exclusive is omitted or is <code>false</code>,
2631  *  the last value will be included; otherwise it will be excluded.
2632  *
2633  *     "a8".upto("b6") {|s| print s, ' ' }
2634  *     for s in "a8".."b6"
2635  *       print s, ' '
2636  *     end
2637  *
2638  *  <em>produces:</em>
2639  *
2640  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2641  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2642  */
2643
2644 static VALUE
2645 rb_str_upto(int argc, VALUE *argv, VALUE beg)
2646 {
2647     VALUE end, exclusive;
2648     VALUE current, after_end;
2649     ID succ;
2650     int n, excl;
2651     rb_encoding *enc;
2652
2653     rb_scan_args(argc, argv, "11", &end, &exclusive);
2654     excl = RTEST(exclusive);
2655     CONST_ID(succ, "succ");
2656     StringValue(end);
2657     enc = rb_enc_check(beg, end);
2658     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 &&
2659         is_ascii_string(beg) && is_ascii_string(end)) {
2660         char c = RSTRING_PTR(beg)[0];
2661         char e = RSTRING_PTR(end)[0];
2662
2663         if (c > e || (excl && c == e)) return beg;
2664         for (;;) {
2665             rb_yield(rb_enc_str_new(&c, 1, enc));
2666             if (!excl && c == e) break;
2667             c++;
2668             if (excl && c == e) break;
2669         }
2670         return beg;
2671     }
2672     n = rb_str_cmp(beg, end);
2673     if (n > 0 || (excl && n == 0)) return beg;
2674
2675     after_end = rb_funcall(end, succ, 0, 0);
2676     current = beg;
2677     while (!rb_str_equal(current, after_end)) {
2678         rb_yield(current);
2679         if (!excl && rb_str_equal(current, end)) break;
2680         current = rb_funcall(current, succ, 0, 0);
2681         StringValue(current);
2682         if (excl && rb_str_equal(current, end)) break;
2683         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
2684             break;
2685     }
2686
2687     return beg;
2688 }
2689
2690 static VALUE
2691 rb_str_subpat(VALUE str, VALUE re, int nth)
2692 {
2693     if (rb_reg_search(re, str, 0, 0) >= 0) {
2694         return rb_reg_nth_match(nth, rb_backref_get());
2695     }
2696     return Qnil;
2697 }
2698
2699 static VALUE
2700 rb_str_aref(VALUE str, VALUE indx)
2701 {
2702     long idx;
2703
2704     switch (TYPE(indx)) {
2705       case T_FIXNUM:
2706         idx = FIX2LONG(indx);
2707
2708       num_index:
2709         str = rb_str_substr(str, idx, 1);
2710         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
2711         return str;
2712
2713       case T_REGEXP:
2714         return rb_str_subpat(str, indx, 0);
2715
2716       case T_STRING:
2717         if (rb_str_index(str, indx, 0) != -1)
2718             return rb_str_dup(indx);
2719         return Qnil;
2720
2721       default:
2722         /* check if indx is Range */
2723         {
2724             long beg, len;
2725             VALUE tmp;
2726
2727             len = str_strlen(str, STR_ENC_GET(str));
2728             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
2729               case Qfalse:
2730                 break;
2731               case Qnil:
2732                 return Qnil;
2733               default:
2734                 tmp = rb_str_substr(str, beg, len);
2735                 return tmp;
2736             }
2737         }
2738         idx = NUM2LONG(indx);
2739         goto num_index;
2740     }
2741     return Qnil;                /* not reached */
2742 }
2743
2744
2745 /*
2746  *  call-seq:
2747  *     str[fixnum]                 => new_str or nil
2748  *     str[fixnum, fixnum]         => new_str or nil
2749  *     str[range]                  => new_str or nil
2750  *     str[regexp]                 => new_str or nil
2751  *     str[regexp, fixnum]         => new_str or nil
2752  *     str[other_str]              => new_str or nil
2753  *     str.slice(fixnum)           => new_str or nil
2754  *     str.slice(fixnum, fixnum)   => new_str or nil
2755  *     str.slice(range)            => new_str or nil
2756  *     str.slice(regexp)           => new_str or nil
2757  *     str.slice(regexp, fixnum)   => new_str or nil
2758  *     str.slice(other_str)        => new_str or nil
2759  *
2760  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
2761  *  substring of one character at that position. If passed two <code>Fixnum</code>
2762  *  objects, returns a substring starting at the offset given by the first, and
2763  *  a length given by the second. If given a range, a substring containing
2764  *  characters at offsets given by the range is returned. In all three cases, if
2765  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
2766  *  <code>nil</code> if the initial offset falls outside the string, the length
2767  *  is negative, or the beginning of the range is greater than the end.
2768  *
2769  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
2770  *  returned. If a numeric parameter follows the regular expression, that
2771  *  component of the <code>MatchData</code> is returned instead. If a
2772  *  <code>String</code> is given, that string is returned if it occurs in
2773  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
2774  *  match.
2775  *
2776  *     a = "hello there"
2777  *     a[1]                   #=> "e"
2778  *     a[1,3]                 #=> "ell"
2779  *     a[1..3]                #=> "ell"
2780  *     a[-3,2]                #=> "er"
2781  *     a[-4..-2]              #=> "her"
2782  *     a[12..-1]              #=> nil
2783  *     a[-2..-4]              #=> ""
2784  *     a[/[aeiou](.)\1/]      #=> "ell"
2785  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
2786  *     a[/[aeiou](.)\1/, 1]   #=> "l"
2787  *     a[/[aeiou](.)\1/, 2]   #=> nil
2788  *     a["lo"]                #=> "lo"
2789  *     a["bye"]               #=> nil
2790  */
2791
2792 static VALUE
2793 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
2794 {
2795     if (argc == 2) {
2796         if (TYPE(argv[0]) == T_REGEXP) {
2797             return rb_str_subpat(str, argv[0], NUM2INT(argv[1]));
2798         }
2799         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
2800     }
2801     if (argc != 1) {
2802         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2803     }
2804     return rb_str_aref(str, argv[0]);
2805 }
2806
2807 static void
2808 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
2809 {
2810     rb_str_modify(str);
2811     if (len < RSTRING_LEN(val)) {
2812         /* expand string */
2813         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
2814     }
2815
2816     if (RSTRING_LEN(val) != len) {
2817         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
2818                 RSTRING_PTR(str) + beg + len,
2819                 RSTRING_LEN(str) - (beg + len));
2820     }
2821     if (RSTRING_LEN(val) < beg && len < 0) {
2822         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
2823     }
2824     if (RSTRING_LEN(val) > 0) {
2825         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
2826     }
2827     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
2828     if (RSTRING_PTR(str)) {
2829         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
2830     }
2831     OBJ_INFECT(str, val);
2832 }
2833
2834 static void
2835 rb_str_splice(VALUE str, long beg, long len, VALUE val)
2836 {
2837     long slen;
2838     char *p, *e;
2839     rb_encoding *enc;
2840     int singlebyte = single_byte_optimizable(str);
2841
2842     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
2843
2844     StringValue(val);
2845     rb_str_modify(str);
2846     enc = rb_enc_check(str, val);
2847     slen = str_strlen(str, enc);
2848
2849     if (slen < beg) {
2850       out_of_range:
2851         rb_raise(rb_eIndexError, "index %ld out of string", beg);
2852     }
2853     if (beg < 0) {
2854         if (-beg > slen) {
2855             goto out_of_range;
2856         }
2857         beg += slen;
2858     }
2859     if (slen < len || slen < beg + len) {
2860         len = slen - beg;
2861     }
2862     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
2863     if (!p) p = RSTRING_END(str);
2864     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
2865     if (!e) e = RSTRING_END(str);
2866     /* error check */
2867     beg = p - RSTRING_PTR(str); /* physical position */
2868     len = e - p;                /* physical length */
2869     rb_str_splice_0(str, beg, len, val);
2870     rb_enc_associate(str, enc);
2871 }
2872
/*
 * Externally visible entry point for the splice operation: forwards all
 * arguments unchanged to rb_str_splice.
 */
void
rb_str_update(VALUE str, long beg, long len, VALUE val)
{
    rb_str_splice(str, beg, len, val);
}
2878
2879 static void
2880 rb_str_subpat_set(VALUE str, VALUE re, int nth, VALUE val)
2881 {
2882     VALUE match;
2883     long start, end, len;
2884     rb_encoding *enc;
2885     struct re_registers *regs;
2886
2887     if (rb_reg_search(re, str, 0, 0) < 0) {
2888         rb_raise(rb_eIndexError, "regexp not matched");
2889     }
2890     match = rb_backref_get();
2891     regs = RMATCH_REGS(match);
2892     if (nth >= regs->num_regs) {
2893       out_of_range:
2894         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
2895     }
2896     if (nth < 0) {
2897         if (-nth >= regs->num_regs) {
2898             goto out_of_range;
2899         }
2900         nth += regs->num_regs;
2901     }
2902
2903     start = BEG(nth);
2904     if (start == -1) {
2905         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
2906     }
2907     end = END(nth);
2908     len = end - start;
2909     StringValue(val);
2910     enc = rb_enc_check(str, val);
2911     rb_str_splice_0(str, start, len, val);
2912     rb_enc_associate(str, enc);
2913 }
2914
2915 static VALUE
2916 rb_str_aset(VALUE str, VALUE indx, VALUE val)
2917 {
2918     long idx, beg;
2919
2920     switch (TYPE(indx)) {
2921       case T_FIXNUM:
2922         idx = FIX2LONG(indx);
2923       num_index:
2924         rb_str_splice(str, idx, 1, val);
2925         return val;
2926
2927       case T_REGEXP:
2928         rb_str_subpat_set(str, indx, 0, val);
2929         return val;
2930
2931       case T_STRING:
2932         beg = rb_str_index(str, indx, 0);
2933         if (beg < 0) {
2934             rb_raise(rb_eIndexError, "string not matched");
2935         }
2936         beg = rb_str_sublen(str, beg);
2937         rb_str_splice(str, beg, str_strlen(indx, 0), val);
2938         return val;
2939
2940       default:
2941         /* check if indx is Range */
2942         {
2943             long beg, len;
2944             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
2945                 rb_str_splice(str, beg, len, val);
2946                 return val;
2947             }
2948         }
2949         idx = NUM2LONG(indx);
2950         goto num_index;
2951     }
2952 }
2953
2954 /*
2955  *  call-seq:
2956  *     str[fixnum] = new_str
2957  *     str[fixnum, fixnum] = new_str
2958  *     str[range] = new_str
2959  *     str[regexp] = new_str
2960  *     str[regexp, fixnum] = new_str
2961  *     str[other_str] = new_str
2962  *
2963  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
2964  *  portion of the string affected is determined using the same criteria as
2965  *  <code>String#[]</code>. If the replacement string is not the same length as
2966  *  the text it is replacing, the string will be adjusted accordingly. If the
2967  *  regular expression or string used as the index doesn't match a position
2968  *  in the string, <code>IndexError</code> is raised. If the regular expression
2969  *  form is used, the optional second <code>Fixnum</code> allows you to specify
2970  *  which portion of the match to replace (effectively using the
2971  *  <code>MatchData</code> indexing rules). The forms that take a
2972  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
2973  *  out of range; the <code>Range</code> form will raise a
2974  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
2975  *  forms will raise an <code>IndexError</code> on negative match.
2976  */
2977
2978 static VALUE
2979 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
2980 {
2981     if (argc == 3) {
2982         if (TYPE(argv[0]) == T_REGEXP) {
2983             rb_str_subpat_set(str, argv[0], NUM2INT(argv[1]), argv[2]);
2984         }
2985         else {
2986             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
2987         }
2988         return argv[2];
2989     }
2990     if (argc != 2) {
2991         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
2992     }
2993     return rb_str_aset(str, argv[0], argv[1]);
2994 }
2995
2996 /*
2997  *  call-seq:
2998  *     str.insert(index, other_str)   => str
2999  *
3000  *  Inserts <i>other_str</i> before the character at the given
3001  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
3002  *  end of the string, and insert <em>after</em> the given character.
3003  *  The intent is to insert <i>other_str</i> so that it starts at the given
3004  *  <i>index</i>.
3005  *
3006  *     "abcd".insert(0, 'X')    #=> "Xabcd"
3007  *     "abcd".insert(3, 'X')    #=> "abcXd"
3008  *     "abcd".insert(4, 'X')    #=> "abcdX"
3009  *     "abcd".insert(-3, 'X')   #=> "abXcd"
3010  *     "abcd".insert(-1, 'X')   #=> "abcdX"
3011  */
3012
3013 static VALUE
3014 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
3015 {
3016     long pos = NUM2LONG(idx);
3017
3018     if (pos == -1) {
3019         return rb_str_append(str, str2);
3020     }
3021     else if (pos < 0) {
3022         pos++;
3023     }
3024     rb_str_splice(str, pos, 0, str2);
3025     return str;
3026 }
3027
3028
3029 /*
3030  *  call-seq:
3031  *     str.slice!(fixnum)           => new_str or nil
3032  *     str.slice!(fixnum, fixnum)   => new_str or nil
3033  *     str.slice!(range)            => new_str or nil
3034  *     str.slice!(regexp)           => new_str or nil
3035  *     str.slice!(other_str)        => new_str or nil
3036  *
3037  *  Deletes the specified portion from <i>str</i>, and returns the portion
3038  *  deleted.
3039  *
3040  *     string = "this is a string"
3041  *     string.slice!(2)        #=> "i"
3042  *     string.slice!(3..6)     #=> " is "
3043  *     string.slice!(/s.*t/)   #=> "sa st"
3044  *     string.slice!("r")      #=> "r"
3045  *     string                  #=> "thing"
3046  */
3047
3048 static VALUE
3049 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
3050 {
3051     VALUE result;
3052     VALUE buf[3];
3053     int i;
3054
3055     if (argc < 1 || 2 < argc) {
3056         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
3057     }
3058     for (i=0; i<argc; i++) {
3059         buf[i] = argv[i];
3060     }
3061     rb_str_modify(str);
3062     buf[i] = rb_str_new(0,0);
3063     result = rb_str_aref_m(argc, buf, str);
3064     if (!NIL_P(result)) {
3065         rb_str_aset_m(argc+1, buf, str);
3066     }
3067     return result;
3068 }
3069
3070 static VALUE
3071 get_pat(VALUE pat, int quote)
3072 {
3073     VALUE val;
3074
3075     switch (TYPE(pat)) {
3076       case T_REGEXP:
3077         return pat;
3078
3079       case T_STRING:
3080         break;
3081
3082       default:
3083         val = rb_check_string_type(pat);
3084         if (NIL_P(val)) {
3085             Check_Type(pat, T_REGEXP);
3086         }
3087         pat = val;
3088     }
3089
3090     if (quote) {
3091         pat = rb_reg_quote(pat);
3092     }
3093
3094     return rb_reg_regcomp(pat);
3095 }
3096
3097
3098 /*
3099  *  call-seq:
3100  *     str.sub!(pattern, replacement)          => str or nil
3101  *     str.sub!(pattern) {|match| block }      => str or nil
3102  *
3103  *  Performs the substitutions of <code>String#sub</code> in place,
3104  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
3105  *  performed.
3106  */
3107
3108 static VALUE
3109 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
3110 {
         /*
          * Replaces the first match of argv[0] in str by editing the buffer of
          * str in place.  Returns str when the pattern matched and Qnil when
          * it did not.  The replacement comes from the block (one argument),
          * or from argv[1], which is either a Hash (or convertible with
          * to_hash) looked up by the matched text, or a String.
          */
3111     VALUE pat, repl, hash = Qnil;
3112     int iter = 0;
3113     int tainted = 0;
3114     long plen;
3115
3116     if (argc == 1 && rb_block_given_p()) {
3117         iter = 1;
3118     }
3119     else if (argc == 2) {
3120         repl = argv[1];
3121         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3122         if (NIL_P(hash)) {
                 /* not a Hash: the replacement must then be a String */
3123             StringValue(repl);
3124         }
3125         if (OBJ_TAINTED(repl)) tainted = 1;
3126     }
3127     else {
3128         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3129     }
3130
         /* quote=1: a String pattern is escaped before being compiled */
3131     pat = get_pat(argv[0], 1);
3132     if (rb_reg_search(pat, str, 0, 0) >= 0) {
3133         rb_encoding *enc;
3134         int cr = ENC_CODERANGE(str);   /* read before rb_str_modify() below */
3135         VALUE match = rb_backref_get();
3136         struct re_registers *regs = RMATCH_REGS(match);
             /* start and end of the whole match; used as byte offsets into
              * the buffer of str further down */
3137         long beg0 = BEG(0);
3138         long end0 = END(0);
3139
3140         if (iter || !NIL_P(hash)) {
                 /* remember buffer and length so str_mod_check() can compare
                  * them after user code (block or hash lookup) has run */
3141             char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
3142
3143             if (iter) {
3144                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3145             }
3146             else {
3147                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
3148                 repl = rb_obj_as_string(repl);
3149             }
3150             str_mod_check(str, p, len);
3151             str_frozen_check(str);
3152         }
3153         else {
                 /* NOTE(review): presumably expands back-references such as
                  * \1 in the replacement string -- confirm in re.c */
3154             repl = rb_reg_regsub(repl, str, regs, pat);
3155         }
3156         enc = rb_enc_compatible(str, repl);
3157         if (!enc) {
                 /*
                  * No common encoding.  The substitution is still accepted
                  * when the text before and after the match both scan as
                  * 7BIT; the result then takes the encoding of repl.
                  */
3158             rb_encoding *str_enc = STR_ENC_GET(str);
3159             if (coderange_scan(RSTRING_PTR(str), beg0, str_enc) != ENC_CODERANGE_7BIT ||
3160                 coderange_scan(RSTRING_PTR(str)+end0,
3161                                RSTRING_LEN(str)-end0, str_enc) != ENC_CODERANGE_7BIT) {
3162                 rb_raise(rb_eArgError, "character encodings differ: %s and %s",
3163                          rb_enc_name(str_enc),
3164                          rb_enc_name(STR_ENC_GET(repl)));
3165             }
3166             enc = STR_ENC_GET(repl);
3167         }
3168         rb_str_modify(str);
3169         rb_enc_associate(str, enc);
3170         if (OBJ_TAINTED(repl)) tainted = 1;
             /* when the saved code range lies strictly between UNKNOWN and
              * BROKEN, it is replaced by that of repl if the latter is
              * unknown or numerically greater */
3171         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
3172             int cr2 = ENC_CODERANGE(repl);
3173             if (cr2 == ENC_CODERANGE_UNKNOWN || cr2 > cr) cr = cr2;
3174         }
3175         plen = end0 - beg0;        /* length of the matched text */
3176         if (RSTRING_LEN(repl) > plen) {
                 /* replacement is longer than the match: grow first */
3177             RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
3178         }
3179         if (RSTRING_LEN(repl) != plen) {
                 /* shift the text after the match to where it will sit once
                  * the replacement is in place (regions may overlap) */
3180             memmove(RSTRING_PTR(str) + beg0 + RSTRING_LEN(repl),
3181                     RSTRING_PTR(str) + beg0 + plen,
3182                     RSTRING_LEN(str) - beg0 - plen);
3183         }
3184         memcpy(RSTRING_PTR(str) + beg0,
3185                RSTRING_PTR(repl), RSTRING_LEN(repl));
3186         STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
3187         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3188         ENC_CODERANGE_SET(str, cr);
3189         if (tainted) OBJ_TAINT(str);
3190
3191         return str;
3192     }
3193     return Qnil;
3194 }
3195
3196
3197 /*
3198  *  call-seq:
3199  *     str.sub(pattern, replacement)         => new_str
3200  *     str.sub(pattern) {|match| block }     => new_str
3201  *
3202  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
3203  *  <i>pattern</i> replaced with either <i>replacement</i> or the value of the
3204  *  block. The <i>pattern</i> will typically be a <code>Regexp</code>; if it is
3205  *  a <code>String</code> then no regular expression metacharacters will be
3206  *  interpreted (that is <code>/\d/</code> will match a digit, but
3207  *  <code>'\d'</code> will match a backslash followed by a 'd').
3208  *
3209  *  If the method call specifies <i>replacement</i>, special variables such as
3210  *  <code>$&</code> will not be useful, as substitution into the string occurs
3211  *  before the pattern match starts. However, the sequences <code>\1</code>,
3212  *  <code>\2</code>, <code>\k<group_name></code>, etc., may be used.
3213  *
3214  *  In the block form, the current match string is passed in as a parameter, and
3215  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3216  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3217  *  returned by the block will be substituted for the match on each call.
3218  *
3219  *  The result inherits any tainting in the original string or any supplied
3220  *  replacement string.
3221  *
3222  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
3223  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
3224  *     "hello".sub(/./) {|s| s[0].ord.to_s + ' ' }  #=> "104 ello"
3225  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
3226  */
3227
3228 static VALUE
3229 rb_str_sub(int argc, VALUE *argv, VALUE str)
3230 {
3231     str = rb_str_dup(str);
3232     rb_str_sub_bang(argc, argv, str);
3233     return str;
3234 }
3235
3236 static VALUE
3237 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
3238 {
         /*
          * Shared implementation of gsub (bang == 0) and gsub! (bang != 0).
          * Every match of argv[0] in str is replaced; the result is built in
          * a separate buffer (dest).  With bang set, str takes over the
          * contents of dest and is returned, or Qnil is returned when nothing
          * matched.  Without bang, the new string is returned, or a
          * duplicate of str when nothing matched.
          */
3239     VALUE pat, val, repl, match, dest, hash = Qnil;
3240     struct re_registers *regs;
3241     long beg, n;
3242     long beg0, end0;
3243     long offset, blen, slen, len, last;
3244     int iter = 0;
3245     char *sp, *cp;
3246     int tainted = 0;
3247     rb_encoding *str_enc;
3248
3249     switch (argc) {
3250       case 1:
             /* NOTE(review): RETURN_ENUMERATOR presumably returns an
              * enumerator when no block is given -- confirm in ruby.h */
3251         RETURN_ENUMERATOR(str, argc, argv);
3252         iter = 1;
3253         break;
3254       case 2:
             /* replacement is a Hash (or has to_hash), otherwise a String */
3255         repl = argv[1];
3256         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3257         if (NIL_P(hash)) {
3258             StringValue(repl);
3259         }
3260         if (OBJ_TAINTED(repl)) tainted = 1;
3261         break;
3262       default:
3263         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3264     }
3265
3266     pat = get_pat(argv[0], 1);
3267     beg = rb_reg_search(pat, str, 0, 0);
3268     if (beg < 0) {
3269         if (bang) return Qnil;  /* no match, no substitution */
3270         return rb_str_dup(str);
3271     }
3272
         /* offset: position in str up to which input has been consumed */
3273     offset = 0;
         /* n is incremented once per match but is not read in this function */
3274     n = 0;
3275     blen = RSTRING_LEN(str) + 30; /* len + margin */
3276     dest = rb_str_buf_new(blen);
         /* sp/slen: buffer and length of str as seen now, handed to
          * str_mod_check() after user code has run */
3277     sp = RSTRING_PTR(str);
3278     slen = RSTRING_LEN(str);
         /* cp: start of the not yet copied part of str */
3279     cp = sp;
3280     str_enc = STR_ENC_GET(str);
3281
3282     do {
3283         n++;
3284         match = rb_backref_get();
3285         regs = RMATCH_REGS(match);
3286         beg0 = BEG(0);
3287         end0 = END(0);
3288         if (iter || !NIL_P(hash)) {
3289             if (iter) {
3290                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3291             }
3292             else {
3293                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3294                 val = rb_obj_as_string(val);
3295             }
3296             str_mod_check(str, sp, slen);
3297             if (bang) str_frozen_check(str);
3298             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
3299                 rb_raise(rb_eRuntimeError, "block should not cheat");
3300             }
3301         }
3302         else {
3303             val = rb_reg_regsub(repl, str, regs, pat);
3304         }
3305
3306         if (OBJ_TAINTED(val)) tainted = 1;
3307
3308         len = beg - offset;     /* copy pre-match substr */
3309         if (len) {
3310             rb_enc_str_buf_cat(dest, cp, len, str_enc);
3311         }
3312
3313         rb_str_buf_append(dest, val);
3314
             /* last keeps the offset this search started from; it is used
              * for the final rb_reg_search() after the loop */
3315         last = offset;
3316         offset = end0;
3317         if (beg0 == end0) {
3318             /*
3319              * Always consume at least one character of the input string
3320              * in order to prevent infinite loops.
3321              */
3322             if (RSTRING_LEN(str) <= end0) break;
3323             len = rb_enc_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
3324             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
3325             offset = end0 + len;
3326         }
3327         cp = RSTRING_PTR(str) + offset;
3328         if (offset > RSTRING_LEN(str)) break;
3329         beg = rb_reg_search(pat, str, offset, 0);
3330     } while (beg >= 0);
         /* copy whatever follows the last match */
3331     if (RSTRING_LEN(str) > offset) {
3332         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
3333     }
         /* NOTE(review): this repeats the search from the start offset of the
          * last match, presumably so the match data left behind describes
          * that match rather than the failed final search -- confirm */
3334     rb_reg_search(pat, str, last, 0);
3335     if (bang) {
3336         rb_str_shared_replace(str, dest);
3337     }
3338     else {
             /* the new string gets the class of the receiver */
3339         RBASIC(dest)->klass = rb_obj_class(str);
3340         OBJ_INFECT(dest, str);
3341         str = dest;
3342     }
3343
3344     if (tainted) OBJ_TAINT(str);
3345     return str;
3346 }
3347
3348
3349 /*
3350  *  call-seq:
3351  *     str.gsub!(pattern, replacement)        => str or nil
3352  *     str.gsub!(pattern) {|match| block }    => str or nil
3353  *
3354  *  Performs the substitutions of <code>String#gsub</code> in place, returning
3355  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
3356  */
3357
3358 static VALUE
3359 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
3360 {
3361     return str_gsub(argc, argv, str, 1);
3362 }
3363
3364
3365 /*
3366  *  call-seq:
3367  *     str.gsub(pattern, replacement)       => new_str
3368  *     str.gsub(pattern) {|match| block }   => new_str
3369  *
3370  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of <i>pattern</i>
3371  *  replaced with either <i>replacement</i> or the value of the block. The
3372  *  <i>pattern</i> will typically be a <code>Regexp</code>; if it is a
3373  *  <code>String</code> then no regular expression metacharacters will be
3374  *  interpreted (that is <code>/\d/</code> will match a digit, but
3375  *  <code>'\d'</code> will match a backslash followed by a 'd').
3376  *
3377  *  If a string is used as the replacement, special variables from the match
3378  *  (such as <code>$&</code> and <code>$1</code>) cannot be substituted into it,
3379  *  as substitution into the string occurs before the pattern match
3380  *  starts. However, the sequences <code>\1</code>, <code>\2</code>,
3381  *  <code>\k<group_name></code>, and so on may be used to interpolate
3382  *  successive groups in the match.
3383  *
3384  *  In the block form, the current match string is passed in as a parameter, and
3385  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3386  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3387  *  returned by the block will be substituted for the match on each call.
3388  *
3389  *  The result inherits any tainting in the original string or any supplied
3390  *  replacement string.
3391  *
3392  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
3393  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
3394  *     "hello".gsub(/./) {|s| s[0].ord.to_s + ' '}   #=> "104 101 108 108 111 "
3395  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
3396  */
3397
3398 static VALUE
3399 rb_str_gsub(int argc, VALUE *argv, VALUE str)
3400 {
3401     return str_gsub(argc, argv, str, 0);
3402 }
3403
3404
3405 /*
3406  *  call-seq:
3407  *     str.replace(other_str)   => str
3408  *
3409  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
3410  *  values in <i>other_str</i>.
3411  *
3412  *     s = "hello"         #=> "hello"
3413  *     s.replace "world"   #=> "world"
3414  */
3415
3416 static VALUE
3417 rb_str_replace(VALUE str, VALUE str2)
3418 {
         /*
          * Makes str carry the contents of str2 and returns str.  When str2
          * is already a shared string, str is pointed at the same buffer;
          * otherwise str is replaced through str_replace_shared() with the
          * result of rb_str_new4(str2).
          */
3419     long len;
3420     if (str == str2) return str;
3421
3422     StringValue(str2);
3423     len = RSTRING_LEN(str2);
3424     if (STR_ASSOC_P(str2)) {
             /* NOTE(review): an "assoc" string is first turned into the
              * result of rb_str_new4(); what that returns is defined
              * outside this section -- confirm */
3425         str2 = rb_str_new4(str2);
3426     }
3427     if (STR_SHARED_P(str2)) {
             /* release the heap buffer str owns itself, if it has one */
3428         if (str_independent(str) && !STR_EMBED_P(str)) {
3429             xfree(RSTRING_PTR(str));
3430         }
             /* STR_SET_NOEMBED also zeroes the embedded length bits, so it
              * has to come before the heap fields are filled in */
3431         STR_SET_NOEMBED(str);
3432         RSTRING(str)->as.heap.len = len;
3433         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
3434         FL_SET(str, ELTS_SHARED);
3435         FL_UNSET(str, STR_ASSOC);
             /* both strings now refer to the same shared object */
3436         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
3437     }
3438     else {
3439         rb_str_modify(str);
3440         str_replace_shared(str, rb_str_new4(str2));
3441     }
3442
         /* taint is propagated from str2; encoding and code range are copied */
3443     OBJ_INFECT(str, str2);
3444     rb_enc_cr_str_exact_copy(str, str2);
3445     return str;
3446 }
3447
3448 /*
3449  *  call-seq:
3450  *     string.clear    ->  string
3451  *
3452  *  Makes string empty.
3453  *
3454  *     a = "abcde"
3455  *     a.clear    #=> ""
3456  */
3457
3458 static VALUE
3459 rb_str_clear(VALUE str)
3460 {
3461     /* rb_str_modify() */       /* no need for str_make_independent */
3462     if (str_independent(str) && !STR_EMBED_P(str)) {
3463         xfree(RSTRING_PTR(str));
3464     }
3465     STR_SET_EMBED(str);
3466     STR_SET_EMBED_LEN(str, 0);
3467     RSTRING_PTR(str)[0] = 0;
3468     ENC_CODERANGE_CLEAR(str);
3469     return str;
3470 }
3471
3472 /*
3473  *  call-seq:
3474  *     string.chr    ->  string
3475  *
3476  *  Returns a one-character string at the beginning of the string.
3477  *
3478  *     a = "abcde"
3479  *     a.chr    #=> "a"
3480  */
3481
3482 static VALUE
3483 rb_str_chr(VALUE str)
3484 {
3485     return rb_str_substr(str, 0, 1);
3486 }
3487
3488 /*
3489  *  call-seq:
3490  *     str.getbyte(index)          => 0 .. 255
3491  *
3492  *  Returns the <i>index</i>th byte as an integer, or <code>nil</code> if <i>index</i> is out of range.
3493  */
3494 static VALUE
3495 rb_str_getbyte(VALUE str, VALUE index)
3496 {
3497     long pos = NUM2LONG(index);
3498
3499     if (pos < 0)
3500         pos += RSTRING_LEN(str);
3501     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
3502         return Qnil;
3503
3504     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
3505 }
3506
3507 /*
3508  *  call-seq:
3509  *     str.setbyte(index, int) => int
3510  *
3511  *  Sets the <i>index</i>th byte to <i>int</i>. Raises IndexError if <i>index</i> is out of range.
3512  */
3513 static VALUE
3514 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
3515 {
3516     long pos = NUM2LONG(index);
3517     int byte = NUM2INT(value);
3518
3519     rb_str_modify(str);
3520
3521     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
3522         rb_raise(rb_eIndexError, "index %ld out of string", pos);
3523     if (pos < 0)
3524         pos += RSTRING_LEN(str);
3525
3526     RSTRING_PTR(str)[pos] = byte;
3527
3528     return value;
3529 }
3530
3531 /*
3532  *  call-seq:
3533  *     str.reverse   => new_str
3534  *
3535  *  Returns a new string with the characters from <i>str</i> in reverse order.
3536  *
3537  *     "stressed".reverse   #=> "desserts"
3538  */
3539
3540 static VALUE
3541 rb_str_reverse(VALUE str)
3542 {
3543     rb_encoding *enc;
3544     VALUE rev;
3545     char *s, *e, *p;
3546     int single = 1;
3547
3548     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
3549     enc = STR_ENC_GET(str);
3550     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
3551     s = RSTRING_PTR(str); e = RSTRING_END(str);
3552     p = RSTRING_END(rev);
3553
3554     if (RSTRING_LEN(str) > 1) {
3555         if (single_byte_optimizable(str)) {
3556             while (s < e) {
3557                 *--p = *s++;
3558             }
3559         }
3560         else {
3561             while (s < e) {
3562                 int clen = rb_enc_mbclen(s, e, enc);
3563
3564                 if (clen > 1 || (*s & 0x80)) single = 0;
3565                 p -= clen;
3566                 memcpy(p, s, clen);
3567                 s += clen;
3568             }
3569         }
3570     }
3571     STR_SET_LEN(rev, RSTRING_LEN(str));
3572     OBJ_INFECT(rev, str);
3573     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
3574         if (single) {
3575             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
3576         }
3577         else {
3578             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
3579         }
3580     }
3581     rb_enc_cr_str_copy_for_substr(rev, str);
3582
3583     return rev;
3584 }
3585
3586
3587 /*
3588  *  call-seq:
3589  *     str.reverse!   => str
3590  *
3591  *  Reverses <i>str</i> in place.
3592  */
3593
3594 static VALUE
3595 rb_str_reverse_bang(VALUE str)
3596 {
3597     if (RSTRING_LEN(str) > 1) {
3598         if (single_byte_optimizable(str)) {
3599             char *s, *e, c;
3600             int cr = ENC_CODERANGE(str);
3601             int single = 1;
3602
3603             rb_str_modify(str);
3604             s = RSTRING_PTR(str);
3605             e = RSTRING_END(str) - 1;
3606             while (s < e) {
3607                 c = *s;
3608                 if (*s & 0x80) single = 0;
3609                 *s++ = *e;
3610                 *e-- = c;
3611             }
3612             if (cr == ENC_CODERANGE_UNKNOWN && single) {
3613                 cr = ENC_CODERANGE_7BIT;
3614             }
3615             ENC_CODERANGE_SET(str, cr);
3616         }
3617         else {
3618             rb_str_shared_replace(str, rb_str_reverse(str));
3619         }
3620     }
3621     return str;
3622 }
3623
3624
3625 /*
3626  *  call-seq:
3627  *     str.include? other_str   => true or false
3628  *
3629  *  Returns <code>true</code> if <i>str</i> contains the given string or
3630  *  character.
3631  *
3632  *     "hello".include? "lo"   #=> true
3633  *     "hello".include? "ol"   #=> false
3634  *     "hello".include? ?h     #=> true
3635  */
3636
3637 static VALUE
3638 rb_str_include(VALUE str, VALUE arg)
3639 {
3640     long i;
3641
3642     StringValue(arg);
3643     i = rb_str_index(str, arg, 0);
3644
3645     if (i == -1) return Qfalse;
3646     return Qtrue;
3647 }
3648
3649
3650 /*
3651  *  call-seq:
3652  *     str.to_i(base=10)   => integer
3653  *
3654  *  Returns the result of interpreting leading characters in <i>str</i> as an
3655  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
3656  *  end of a valid number are ignored. If there is not a valid number at the
3657  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
3658  *  exception when <i>base</i> is valid.
3659  *
3660  *     "12345".to_i             #=> 12345
3661  *     "99 red balloons".to_i   #=> 99
3662  *     "0a".to_i                #=> 0
3663  *     "0a".to_i(16)            #=> 10
3664  *     "hello".to_i             #=> 0
3665  *     "1100101".to_i(2)        #=> 101
3666  *     "1100101".to_i(8)        #=> 294977
3667  *     "1100101".to_i(10)       #=> 1100101
3668  *     "1100101".to_i(16)       #=> 17826049
3669  */
3670
3671 static VALUE
3672 rb_str_to_i(int argc, VALUE *argv, VALUE str)
3673 {
3674     int base;
3675
3676     if (argc == 0) base = 10;
3677     else {
3678         VALUE b;
3679
3680         rb_scan_args(argc, argv, "01", &b);
3681         base = NUM2INT(b);
3682     }
3683     if (base < 0) {
3684         rb_raise(rb_eArgError, "invalid radix %d", base);
3685     }
3686     return rb_str_to_inum(str, base, Qfalse);
3687 }
3688
3689
3690 /*
3691  *  call-seq:
3692  *     str.to_f   => float
3693  *
3694  *  Returns the result of interpreting leading characters in <i>str</i> as a
3695  *  floating point number. Extraneous characters past the end of a valid number
3696  *  are ignored. If there is not a valid number at the start of <i>str</i>,
3697  *  <code>0.0</code> is returned. This method never raises an exception.
3698  *
3699  *     "123.45e1".to_f        #=> 1234.5
3700  *     "45.67 degrees".to_f   #=> 45.67
3701  *     "thx1138".to_f         #=> 0.0
3702  */
3703
3704 static VALUE
3705 rb_str_to_f(VALUE str)
3706 {
3707     return DOUBLE2NUM(rb_str_to_dbl(str, Qfalse));
3708 }
3709
3710
3711 /*
3712  *  call-seq:
3713  *     str.to_s     => str
3714  *     str.to_str   => str
3715  *
3716  *  Returns the receiver, or a plain String copy of it when the receiver's class is not String.
3717  */
3718
3719 static VALUE
3720 rb_str_to_s(VALUE str)
3721 {
3722     if (rb_obj_class(str) != rb_cString) {
3723         VALUE dup = str_alloc(rb_cString);
3724         rb_str_replace(dup, str);
3725         return dup;
3726     }
3727     return str;
3728 }
3729
3730 static void
3731 str_cat_char(VALUE str, int c, rb_encoding *enc)
3732 {
3733     char s[16];
3734     int n = rb_enc_codelen(c, enc);
3735
3736     rb_enc_mbcput(c, s, enc);
3737     rb_enc_str_buf_cat(str, s, n, enc);
3738 }
3739
3740 static void
3741 prefix_escape(VALUE str, int c, rb_encoding *enc)
3742 {
3743     str_cat_char(str, '\\', enc);
3744     str_cat_char(str, c, enc);
3745 }
3746
3747 /*
3748  * call-seq:
3749  *   str.inspect   => string
3750  *
3751  * Returns a printable version of _str_, surrounded by quote marks,
3752  * with special characters escaped.
3753  *
3754  *    str = "hello"
3755  *    str[3] = "\b"
3756  *    str.inspect       #=> "\"hel\\bo\""
3757  */
3758
3759 VALUE
3760 rb_str_inspect(VALUE str)
3761 {
         /*
          * Builds a new string holding str between double quotes, with
          * quotes, backslashes, interpolation starters and control
          * characters backslash-escaped and everything non-printable or
          * undecodable written as \xNN, one escape per byte.
          */
3762     rb_encoding *enc = STR_ENC_GET(str);
3763     char *p, *pend;
3764     VALUE result = rb_str_buf_new(0);
3765
         /* a string in a non-ASCII-compatible encoding is scanned, and the
          * result labelled, as US-ASCII */
3766     if (!rb_enc_asciicompat(enc)) enc = rb_usascii_encoding();
3767     rb_enc_associate(result, enc);
3768     str_cat_char(result, '"', enc);
3769     p = RSTRING_PTR(str); pend = RSTRING_END(str);
3770     while (p < pend) {
3771         int c;      /* code point of the current character */
3772         int n;      /* its length in bytes */
3773         int cc;     /* code point following a '#' */
3774
3775         n = rb_enc_precise_mbclen(p, pend, enc);
3776         if (!MBCLEN_CHARFOUND_P(n)) {
                 /* no character found at p: step over one byte and emit it
                  * through the \xNN loop at the end of the chain */
3777             p++;
3778             n = 1;
3779             goto escape_codepoint;
3780         }
3781         n = MBCLEN_CHARFOUND_LEN(n);
3782
3783         c = rb_enc_codepoint(p, pend, enc);
3784         n = rb_enc_codelen(c, enc);
3785
             /* from here on p points past the character; p-n is its start */
3786         p += n;
             /* '#' is escaped only when followed by '$', '@' or '{' */
3787         if (c == '"'|| c == '\\' ||
3788             (c == '#' &&
3789              p < pend &&
3790              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
3791              (cc = rb_enc_codepoint(p,pend,enc),
3792               (cc == '$' || cc == '@' || cc == '{')))) {
3793             prefix_escape(result, c, enc);
3794         }
3795         else if (c == '\n') {
3796             prefix_escape(result, 'n', enc);
3797         }
3798         else if (c == '\r') {
3799             prefix_escape(result, 'r', enc);
3800         }
3801         else if (c == '\t') {
3802             prefix_escape(result, 't', enc);
3803         }
3804         else if (c == '\f') {
3805             prefix_escape(result, 'f', enc);
3806         }
3807         else if (c == '\013') {
3808             prefix_escape(result, 'v', enc);
3809         }
3810         else if (c == '\010') {
3811             prefix_escape(result, 'b', enc);
3812         }
3813         else if (c == '\007') {
3814             prefix_escape(result, 'a', enc);
3815         }
3816         else if (c == 033) {
3817             prefix_escape(result, 'e', enc);
3818         }
3819         else if (rb_enc_isprint(c, enc)) {
                 /* printable: copy the character's bytes unchanged */
3820             rb_enc_str_buf_cat(result, p-n, n, enc);
3821         }
3822         else {
3823             char buf[5];    /* "\xNN" plus the terminating NUL */
3824             char *s;
3825             char *q;
3826
3827           escape_codepoint:
                 /* one \xNN escape for each of the n bytes before p */
3828             for (q = p-n; q < p; q++) {
3829                 s = buf;
3830                 sprintf(buf, "\\x%02X", *q & 0377);
3831                 while (*s) {
3832                     str_cat_char(result, *s++, enc);
3833                 }
3834             }
3835         }
3836     }
3837     str_cat_char(result, '"', enc);
3838
3839     OBJ_INFECT(result, str);
3840     return result;
3841 }
3842
3843 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))  /* p is before e and points at '$', '@' or '{' */
3844
3845 /*
3846  *  call-seq:
3847  *     str.dump   => new_str
3848  *
3849  *  Produces a version of <i>str</i> with all nonprinting characters replaced by
3850  *  <code>\xNN</code> notation and all special characters escaped.
3851  */
3852
3853 VALUE
3854 rb_str_dump(VALUE str)
3855 {
         /*
          * Works in two passes over the bytes of str: the first computes the
          * exact length of the output, the second writes it into a string
          * allocated with that length.  The two passes must classify every
          * byte identically.
          */
3856     rb_encoding *enc0 = rb_enc_get(str);
3857     long len;
3858     const char *p, *pend;
         /* qend is assigned below but not otherwise used in this function */
3859     char *q, *qend;
3860     VALUE result;
3861
3862     len = 2;                    /* "" */
3863     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3864     while (p < pend) {
3865         unsigned char c = *p++;
3866         switch (c) {
               /* bytes written as a backslash plus one character */
3867           case '"':  case '\\':
3868           case '\n': case '\r':
3869           case '\t': case '\f':
3870           case '\013': case '\010': case '\007': case '\033':
3871             len += 2;
3872             break;
3873
               /* '#' needs a backslash only in front of '$', '@' or '{' */
3874           case '#':
3875             len += IS_EVSTR(p, pend) ? 2 : 1;
3876             break;
3877
3878           default:
3879             if (ISPRINT(c)) {
3880                 len++;
3881             }
3882             else {
3883                 len += 4;               /* \xNN */
3884             }
3885             break;
3886         }
3887     }
         /* a non-ASCII-compatible encoding is recorded in a trailing
          * .force_encoding("name") */
3888     if (!rb_enc_asciicompat(enc0)) {
3889         len += 19;              /* ".force_encoding('')" */
3890         len += strlen(enc0->name);
3891     }
3892
3893     result = rb_str_new5(str, 0, len);
3894     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3895     q = RSTRING_PTR(result); qend = q + len;
3896
         /* second pass: write the escaped bytes */
3897     *q++ = '"';
3898     while (p < pend) {
3899         unsigned char c = *p++;
3900
3901         if (c == '"' || c == '\\') {
3902             *q++ = '\\';
3903             *q++ = c;
3904         }
3905         else if (c == '#') {
3906             if (IS_EVSTR(p, pend)) *q++ = '\\';
3907             *q++ = '#';
3908         }
3909         else if (c == '\n') {
3910             *q++ = '\\';
3911             *q++ = 'n';
3912         }
3913         else if (c == '\r') {
3914             *q++ = '\\';
3915             *q++ = 'r';
3916         }
3917         else if (c == '\t') {
3918             *q++ = '\\';
3919             *q++ = 't';
3920         }
3921         else if (c == '\f') {
3922             *q++ = '\\';
3923             *q++ = 'f';
3924         }
3925         else if (c == '\013') {
3926             *q++ = '\\';
3927             *q++ = 'v';
3928         }
3929         else if (c == '\010') {
3930             *q++ = '\\';
3931             *q++ = 'b';
3932         }
3933         else if (c == '\007') {
3934             *q++ = '\\';
3935             *q++ = 'a';
3936         }
3937         else if (c == '\033') {
3938             *q++ = '\\';
3939             *q++ = 'e';
3940         }
3941         else if (ISPRINT(c)) {
3942             *q++ = c;
3943         }
3944         else {
3945             *q++ = '\\';
                 /* writes 'x' and two hex digits; q then skips those three
                  * characters */
3946             sprintf(q, "x%02X", c);
3947             q += 3;
3948         }
3949     }
3950     *q++ = '"';
3951     if (!rb_enc_asciicompat(enc0)) {
3952         sprintf(q, ".force_encoding(\"%s\")", enc0->name);
             /* in this case the result is labelled with the encoding
              * returned by rb_ascii8bit_encoding() */
3953         enc0 = rb_ascii8bit_encoding();
3954     }
3955
3956     OBJ_INFECT(result, str);
3957     /* result from dump is ASCII */
3958     rb_enc_associate(result, enc0);
3959     return result;
3960 }
3961
3962
3963 /*
3964  *  call-seq:
3965  *     str.upcase!   => str or nil
3966  *
3967  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
3968  *  were made.
3969  *  Note: case replacement is effective only in ASCII region.
3970  */
3971
3972 static VALUE
3973 rb_str_upcase_bang(VALUE str)
3974 {
3975     rb_encoding *enc;
3976     char *s, *send;
3977     int modify = 0;
3978     int cr = ENC_CODERANGE(str);
3979
3980     rb_str_modify(str);
3981     enc = STR_ENC_GET(str);
3982     s = RSTRING_PTR(str); send = RSTRING_END(str);
3983     while (s < send) {
3984         int c = rb_enc_codepoint(s, send, enc);
3985
3986         if (rb_enc_islower(c, enc)) {
3987             /* assuming toupper returns codepoint with same size */
3988             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
3989             modify = 1;
3990         }
3991         s += rb_enc_codelen(c, enc);
3992     }
3993
3994     ENC_CODERANGE_SET(str, cr);
3995     if (modify) return str;
3996     return Qnil;
3997 }
3998
3999
4000 /*
4001  *  call-seq:
4002  *     str.upcase   => new_str
4003  *
4004  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
4005  *  uppercase counterparts. The operation is locale insensitive---only
4006  *  characters ``a'' to ``z'' are affected.
4007  *  Note: case replacement is effective only in ASCII region.
4008  *
4009  *     "hEllO".upcase   #=> "HELLO"
4010  */
4011
4012 static VALUE
4013 rb_str_upcase(VALUE str)
4014 {
4015     str = rb_str_dup(str);
4016     rb_str_upcase_bang(str);
4017     return str;
4018 }
4019
4020
4021 /*
4022  *  call-seq:
4023  *     str.downcase!   => str or nil
4024  *
4025  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
4026  *  changes were made.
4027  *  Note: case replacement is effective only in ASCII region.
4028  */
4029
4030 static VALUE
4031 rb_str_downcase_bang(VALUE str)
4032 {
4033     rb_encoding *enc;
4034     char *s, *send;
4035     int modify = 0;
4036     int cr = ENC_CODERANGE(str);
4037
4038     rb_str_modify(str);
4039     enc = STR_ENC_GET(str);
4040     s = RSTRING_PTR(str); send = RSTRING_END(str);
4041     while (s < send) {
4042         int c = rb_enc_codepoint(s, send, enc);
4043
4044         if (rb_enc_isupper(c, enc)) {
4045             /* assuming toupper returns codepoint with same size */
4046             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4047             modify = 1;
4048         }
4049         s += rb_enc_codelen(c, enc);
4050     }
4051
4052     ENC_CODERANGE_SET(str, cr);
4053     if (modify) return str;
4054     return Qnil;
4055 }
4056
4057
4058 /*
4059  *  call-seq:
4060  *     str.downcase   => new_str
4061  *
4062  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
4063  *  lowercase counterparts. The operation is locale insensitive---only
4064  *  characters ``A'' to ``Z'' are affected.
4065  *  Note: case replacement is effective only in ASCII region.
4066  *
4067  *     "hEllO".downcase   #=> "hello"
4068  */
4069
4070 static VALUE
4071 rb_str_downcase(VALUE str)
4072 {
4073     str = rb_str_dup(str);
4074     rb_str_downcase_bang(str);
4075     return str;
4076 }
4077
4078
4079 /*
4080  *  call-seq:
4081  *     str.capitalize!   => str or nil
4082  *
4083  *  Modifies <i>str</i> by converting the first character to uppercase and the
4084  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
4085  *  Note: case conversion is effective only in ASCII region.
4086  *
4087  *     a = "hello"
4088  *     a.capitalize!   #=> "Hello"
4089  *     a               #=> "Hello"
4090  *     a.capitalize!   #=> nil
4091  */
4092
4093 static VALUE
4094 rb_str_capitalize_bang(VALUE str)
4095 {
4096     rb_encoding *enc;
4097     char *s, *send;
4098     int modify = 0;
4099     int c;
4100     int cr = ENC_CODERANGE(str);
4101
4102     rb_str_modify(str);
4103     enc = STR_ENC_GET(str);
4104     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4105     s = RSTRING_PTR(str); send = RSTRING_END(str);
4106
4107     c = rb_enc_codepoint(s, send, enc);
4108     if (rb_enc_islower(c, enc)) {
4109         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4110         modify = 1;
4111     }
4112     s += rb_enc_codelen(c, enc);
4113     while (s < send) {
4114         c = rb_enc_codepoint(s, send, enc);
4115         if (rb_enc_isupper(c, enc)) {
4116             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4117             modify = 1;
4118         }
4119         s += rb_enc_codelen(c, enc);
4120     }
4121
4122     ENC_CODERANGE_SET(str, cr);
4123     if (modify) return str;
4124     return Qnil;
4125 }
4126
4127
4128 /*
4129  *  call-seq:
4130  *     str.capitalize   => new_str
4131  *
4132  *  Returns a copy of <i>str</i> with the first character converted to uppercase
4133  *  and the remainder to lowercase.
4134  *  Note: case conversion is effective only in ASCII region.
4135  *
4136  *     "hello".capitalize    #=> "Hello"
4137  *     "HELLO".capitalize    #=> "Hello"
4138  *     "123ABC".capitalize   #=> "123abc"
4139  */
4140
4141 static VALUE
4142 rb_str_capitalize(VALUE str)
4143 {
4144     VALUE copy = rb_str_dup(str);     /* work on a duplicate; the receiver is not touched */
4145     rb_str_capitalize_bang(copy);     /* Qnil (nothing changed) is ignored; the copy is returned either way */
4146     return copy;
4147 }
4148
4149
4150 /*
4151  *  call-seq:
4152  *     str.swapcase!   => str or nil
4153  *
4154  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
4155  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
4156  *  Note: case conversion is effective only in ASCII region.
4157  */
4158
4159 static VALUE
4160 rb_str_swapcase_bang(VALUE str)
4161 {
4162     rb_encoding *enc;
4163     char *p, *pend;
4164     int changed = 0;
4165     int saved_cr = ENC_CODERANGE(str);    /* sampled before rb_str_modify() */
4166
4167     rb_str_modify(str);
4168     enc = STR_ENC_GET(str);
4169     pend = RSTRING_END(str);
4170     for (p = RSTRING_PTR(str); p < pend; ) {
4171         int ch = rb_enc_codepoint(p, pend, enc);
4172         int is_upper = rb_enc_isupper(ch, enc);
4173
4174         /*
4175          * The converted code point overwrites the original in place, which
4176          * assumes both encode to the same number of bytes.
4177          */
4178         if (is_upper || rb_enc_islower(ch, enc)) {
4179             int swapped = is_upper ? rb_enc_tolower(ch, enc)
4180                                    : rb_enc_toupper(ch, enc);
4181             rb_enc_mbcput(swapped, p, enc);
4182             changed = 1;
4183         }
4184         p += rb_enc_codelen(ch, enc);
4185     }
4186
4187     ENC_CODERANGE_SET(str, saved_cr);
4188     return changed ? str : Qnil;
4189 }
4190
4191
4192 /*
4193  *  call-seq:
4194  *     str.swapcase   => new_str
4195  *
4196  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
4197  *  to lowercase and lowercase characters converted to uppercase.
4198  *  Note: case conversion is effective only in ASCII region.
4199  *
4200  *     "Hello".swapcase          #=> "hELLO"
4201  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
4202  */
4203
4204 static VALUE
4205 rb_str_swapcase(VALUE str)
4206 {
4207     VALUE swapped = rb_str_dup(str);    /* work on a duplicate; the receiver is not touched */
4208     rb_str_swapcase_bang(swapped);      /* Qnil (nothing changed) is ignored; the copy is returned either way */
4209     return swapped;
4210 }
4211
4212 typedef unsigned char *USTR;
4213
4214 struct tr {    /* cursor used by trnext() to walk a tr-style character specification */
4215     int gen, now, max;    /* gen: nonzero while a c1-c2 range is being expanded; now: code point produced last; max: upper bound of that range */
4216     char *p, *pend;       /* p: read position in the specification; pend: one past its last byte */
4217 };
4218
4219 static int    /* next code point described by the specification, or -1 once it is exhausted */
4220 trnext(struct tr *t, rb_encoding *enc)
4221 {
4222     for (;;) {
4223         if (!t->gen) {    /* not inside a range: read the next item from the specification */
4224             if (t->p == t->pend) return -1;
4225             if (t->p < t->pend - 1 && *t->p == '\\') {    /* a backslash that is not the final byte is skipped; the character after it is read below */
4226                 t->p++;
4227             }
4228             t->now = rb_enc_codepoint(t->p, t->pend, enc);
4229             t->p += rb_enc_codelen(t->now, enc);
4230             if (t->p < t->pend - 1 && *t->p == '-') {    /* '-' with more input after it introduces a range from now to c */
4231                 t->p++;
4232                 if (t->p < t->pend) {
4233                     int c = rb_enc_codepoint(t->p, t->pend, enc);
4234                     t->p += rb_enc_codelen(c, enc);
4235                     if (t->now > c) continue;    /* reversed range: drop it and read the next item */
4236                     t->gen = 1;
4237                     t->max = c;
4238                 }
4239             }
4240             return t->now;
4241         }
4242         else if (++t->now < t->max) {    /* inside a range: step to the next code point */
4243             return t->now;
4244         }
4245         else {    /* upper bound reached: leave range mode and return the bound itself */
4246             t->gen = 0;
4247             return t->max;
4248         }
4249     }
4250 }
4251
4252 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);    /* defined further down; used when repl is empty */
4253
4254 static VALUE    /* str when at least one character was translated, otherwise Qnil */
4255 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
4256 {    /* translate str in place: characters selected by src map to those of repl; nonzero sflag also squeezes runs of one replacement */
4257     int trans[256];    /* for code points below 256: the replacement, or -1 for "leave untouched" */
4258     rb_encoding *enc, *e1, *e2;
4259     struct tr trsrc, trrepl;
4260     int cflag = 0;     /* set when src starts with '^' (complemented set) */
4261     int c, c0, last = 0, modify = 0, i, l;
4262     char *s, *send;
4263     VALUE hash = 0;    /* created lazily for source code points >= 256 */
4264     int singlebyte = single_byte_optimizable(str);
4265
4266     StringValue(src);
4267     StringValue(repl);
4268     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4269     if (RSTRING_LEN(repl) == 0) {
4270         return rb_str_delete_bang(1, &src, str);    /* an empty replacement means deletion */
4271     }
4272
4273     e1 = rb_enc_check(str, src);
4274     e2 = rb_enc_check(str, repl);
4275     if (e1 == e2) {
4276         enc = e1;
4277     }
4278     else {
4279         enc = rb_enc_check(src, repl);
4280     }
4281     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
4282     if (RSTRING_LEN(src) > 1 &&
4283         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
4284         trsrc.p + l < trsrc.pend) {
4285         cflag = 1;
4286         trsrc.p += l;
4287     }
4288     trrepl.p = RSTRING_PTR(repl);
4289     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
4290     trsrc.gen = trrepl.gen = 0;
4291     trsrc.now = trrepl.now = 0;
4292     trsrc.max = trrepl.max = 0;
4293
4294     if (cflag) {    /* complemented set: everything NOT listed in src maps to the last character of repl */
4295         for (i=0; i<256; i++) {
4296             trans[i] = 1;
4297         }
4298         while ((c = trnext(&trsrc, enc)) >= 0) {
4299             if (c < 256) {
4300                 trans[c] = -1;
4301             }
4302             else {
4303                 if (!hash) hash = rb_hash_new();
4304                 rb_hash_aset(hash, INT2NUM(c), Qtrue);
4305             }
4306         }
4307         while ((c = trnext(&trrepl, enc)) >= 0) /* retrieve last replacer */;
4308         last = trrepl.now;
4309         if (last > 255) singlebyte = 0;    /* the replacement does not fit in one byte: the byte-wise fast path must not be used */
4310         for (i=0; i<256; i++) {
4311             if (trans[i] >= 0) {
4312                 trans[i] = last;
4313             }
4314         }
4315     }
4316     else {    /* plain set: pair src with repl; once repl runs out its last character is reused */
4317         int r;
4318
4319         for (i=0; i<256; i++) {
4320             trans[i] = -1;
4321         }
4322         while ((c = trnext(&trsrc, enc)) >= 0) {
4323             r = trnext(&trrepl, enc);
4324             if (r == -1) r = trrepl.now;
4325             if (c < 256) {
4326                 trans[c] = r;
4327                 if (r > 255) singlebyte = 0;
4328             }
4329             else {
4330                 if (!hash) hash = rb_hash_new();
4331                 rb_hash_aset(hash, INT2NUM(c), INT2NUM(r));
4332             }
4333         }
4334     }
4335
4336     rb_str_modify(str);
4337     s = RSTRING_PTR(str); send = RSTRING_END(str);
4338     if (sflag) {    /* squeezing variant: the result is built in a fresh buffer */
4339         int clen, tlen, max = RSTRING_LEN(str);
4340         int offset, save = -1;    /* save: replacement written last, -1 after an untouched character */
4341         char *buf = ALLOC_N(char, max), *t = buf;
4342
4343         while (s < send) {
4344             c0 = c = rb_enc_codepoint(s, send, enc);
4345             tlen = clen = rb_enc_codelen(c, enc);
4346
4347             s += clen;
4348             if (c < 256) {
4349                 c = trans[c];
4350             }
4351             else if (hash) {
4352                 VALUE tmp = rb_hash_lookup(hash, INT2NUM(c));
4353                 if (NIL_P(tmp)) {
4354                     if (cflag) c = last;
4355                     else c = -1;
4356                 }
4357                 else if (cflag) c = -1;
4358                 else c = NUM2INT(tmp);
4359             }
4360             else {
4361                 c = cflag ? last : -1;    /* no wide code point was listed: under '^' this one is therefore selected */
4362             }
4363             if (c >= 0) {
4364                 if (save == c) continue;    /* same replacement as the previous character: squeeze it away */
4365                 save = c;
4366                 tlen = rb_enc_codelen(c, enc);
4367                 modify = 1;
4368             }
4369             else {
4370                 save = -1;
4371                 c = c0;
4372             }
4373             while (t - buf + tlen >= max) {    /* grow until the character plus the final NUL fit */
4374                 offset = t - buf;
4375                 max *= 2;
4376                 REALLOC_N(buf, char, max);
4377                 t = buf + offset;
4378             }
4379             rb_enc_mbcput(c, t, enc);
4380             t += tlen;
4381         }
4382         if (!STR_EMBED_P(str)) xfree(RSTRING(str)->as.heap.ptr);    /* release the heap buffer that is about to be replaced */
4383         *t = '\0'; RSTRING(str)->as.heap.ptr = buf;
4384         RSTRING(str)->as.heap.len = t - buf;
4385         STR_SET_NOEMBED(str);
4386         RSTRING(str)->as.heap.aux.capa = max;
4387     }
4388     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {    /* byte-wise fast path: rewrite the bytes in place */
4389         while (s < send) {
4390             c = (unsigned char)*s;
4391             if (trans[c] >= 0) {
4392                 if (!cflag) {
4393                     c = trans[c];
4394                     *s = c;
4395                     modify = 1;
4396                 }
4397                 else {
4398                     *s = last;
4399                     modify = 1;
4400                 }
4401             }
4402             s++;
4403         }
4404     }
4405     else {    /* general case: characters may change width, so the result is built in a fresh buffer */
4406         int clen, tlen, max = RSTRING_LEN(str) * 1.2;
4407         int offset;
4408         char *buf = ALLOC_N(char, max), *t = buf;
4409
4410         while (s < send) {
4411             c0 = c = rb_enc_codepoint(s, send, enc);
4412             tlen = clen = rb_enc_codelen(c, enc);
4413
4414             if (c < 256) {
4415                 c = trans[c];
4416             }
4417             else if (hash) {
4418                 VALUE tmp = rb_hash_lookup(hash, INT2NUM(c));
4419                 if (NIL_P(tmp)) {
4420                     if (cflag) c = last;
4421                     else c = -1;
4422                 }
4423                 else if (cflag) c = -1;
4424                 else c = NUM2INT(tmp);
4425             }
4426             else {
4427                 c = cflag ? last : -1;    /* no wide code point was listed: under '^' this one is therefore selected */
4428             }
4429             if (c >= 0) {
4430                 tlen = rb_enc_codelen(c, enc);
4431                 modify = 1;
4432             }
4433             else {
4434                 /* untouched character: copied through without counting as a change */
4435                 c = c0;
4436             }
4437             while (t - buf + tlen >= max) {    /* grow until the character plus the final NUL fit */
4438                 offset = t - buf;
4439                 max *= 2;
4440                 REALLOC_N(buf, char, max);
4441                 t = buf + offset;
4442             }
4443             if (s != t) rb_enc_mbcput(c, t, enc);
4444             s += clen;
4445             t += tlen;
4446         }
4447         if (!STR_EMBED_P(str)) {
4448             xfree(RSTRING(str)->as.heap.ptr);
4449         }
4450         *t = '\0';
4451         RSTRING(str)->as.heap.ptr = buf;
4452         RSTRING(str)->as.heap.len = t - buf;
4453         STR_SET_NOEMBED(str);
4454         RSTRING(str)->as.heap.aux.capa = max;
4455     }
4456
4457     if (modify) {
4458         rb_enc_associate(str, enc);
4459         return str;
4460     }
4461     return Qnil;
4462 }
4463
4464
4465 /*
4466  *  call-seq:
4467  *     str.tr!(from_str, to_str)   => str or nil
4468  *
4469  *  Translates <i>str</i> in place, using the same rules as
4470  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
4471  *  changes were made.
4472  */
4473
4474 static VALUE
4475 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
4476 {
4477     return tr_trans(str, src, repl, 0);    /* sflag 0: translate without squeezing; tr_trans' result (str or Qnil) is passed through */
4478 }
4479
4480
4481 /*
4482  *  call-seq:
4483  *     str.tr(from_str, to_str)   => new_str
4484  *
4485  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i> replaced
4486  *  by the corresponding characters in <i>to_str</i>. If <i>to_str</i> is
4487  *  shorter than <i>from_str</i>, it is padded with its last character. Both
4488  *  strings may use the c1--c2 notation to denote ranges of characters, and
4489  *  <i>from_str</i> may start with a <code>^</code>, which denotes all
4490  *  characters except those listed.
4491  *
4492  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
4493  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
4494  *     "hello".tr('el', 'ip')      #=> "hippo"
4495  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
4496  */
4497
4498 static VALUE
4499 rb_str_tr(VALUE str, VALUE src, VALUE repl)
4500 {
4501     VALUE translated = rb_str_dup(str);    /* work on a duplicate; the receiver is not touched */
4502     tr_trans(translated, src, repl, 0);    /* sflag 0: no squeezing; Qnil (nothing changed) is ignored */
4503     return translated;
4504 }
4505
4506 static void    /* fold one character-set specification (str) into the selection accumulated so far */
4507 tr_setup_table(VALUE str, char stable[256], int first,
4508                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
4509 {
4510     char buf[256];    /* this specification's own selection for code points below 256 */
4511     struct tr tr;
4512     int c, l;
4513     VALUE table = 0, ptable = 0;    /* table: hash built for this specification; ptable: hash previously stored in the out-parameter */
4514     int i, cflag = 0;
4515
4516     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
4517     tr.gen = tr.now = tr.max = 0;
4518
4519     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {    /* a leading '^' (with more text after it) negates the specification */
4520         cflag = 1;
4521         tr.p += l;
4522     }
4523     if (first) {    /* first specification: start with everything selected so the AND below just copies buf */
4524         for (i=0; i<256; i++) {
4525             stable[i] = 1;
4526         }
4527     }
4528     for (i=0; i<256; i++) {
4529         buf[i] = cflag;    /* a negated specification starts with everything selected */
4530     }
4531
4532     while ((c = trnext(&tr, enc)) >= 0) {
4533         if (c < 256) {
4534             buf[c & 0xff] = !cflag;    /* listed characters are selected, or deselected when negated */
4535         }
4536         else {
4537             VALUE key = INT2NUM(c);
4538
4539             if (!table) {    /* first wide code point: create the hash and store it through the matching out-parameter */
4540                 table = rb_hash_new();
4541                 if (cflag) {
4542                     ptable = *ctablep;
4543                     *ctablep = table;
4544                 }
4545                 else {
4546                     ptable = *tablep;
4547                     *tablep = table;
4548                 }
4549             }
4550             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {    /* record the key only if there was no earlier hash or the earlier hash also holds it */
4551                 rb_hash_aset(table, key, Qtrue);
4552             }
4553         }
4554     }
4555     for (i=0; i<256; i++) {
4556         stable[i] = stable[i] && buf[i];    /* intersect with the selection accumulated by earlier calls */
4557     }
4558 }
4559
4560
4561 static int
4562 tr_find(int c, char table[256], VALUE del, VALUE nodel)
4563 {
4564     VALUE key;
4565
4566     /* code points below 256 are answered straight from the byte table */
4567     if (c < 256) return table[c] ? Qtrue : Qfalse;
4568
4569     /*
4570      * Wider code points match only when present in del and, if a nodel
4571      * hash was supplied, absent from it.
4572      */
4573     key = INT2NUM(c);
4574     if (!del || NIL_P(rb_hash_lookup(del, key))) return Qfalse;
4575     if (nodel && !NIL_P(rb_hash_lookup(nodel, key))) return Qfalse;
4576     return Qtrue;
4577 }
4578
4579 /*
4580  *  call-seq:
4581  *     str.delete!([other_str]+)   => str or nil
4582  *
4583  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
4584  *  <code>nil</code> if <i>str</i> was not modified.
4585  */
4586
4587 static VALUE
4588 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
4589 {
4590     char member[256];
4591     rb_encoding *enc = 0;
4592     char *rd, *wr, *end;
4593     VALUE del = 0, nodel = 0;
4594     int removed = 0;
4595     int n;
4596     int saved_cr;
4597
4598     /* an empty receiver is answered before the argument count is checked */
4599     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4600     saved_cr = ENC_CODERANGE(str);
4601     if (argc < 1) rb_raise(rb_eArgError, "wrong number of arguments");
4602
4603     /* fold every argument into one selection (byte table plus hashes) */
4604     for (n = 0; n < argc; n++) {
4605         VALUE spec = argv[n];
4606
4607         StringValue(spec);
4608         enc = rb_enc_check(str, spec);
4609         tr_setup_table(spec, member, n == 0, &del, &nodel, enc);
4610     }
4611
4612     rb_str_modify(str);
4613     rd = wr = RSTRING_PTR(str);
4614     if (!rd || RSTRING_LEN(str) == 0) return Qnil;
4615     end = RSTRING_END(str);
4616
4617     /* compact in place: rd visits every character, wr trails after the kept ones */
4618     while (rd < end) {
4619         int ch = rb_enc_codepoint(rd, end, enc);
4620         int width = rb_enc_codelen(ch, enc);
4621
4622         if (!tr_find(ch, member, del, nodel)) {
4623             if (wr != rd) rb_enc_mbcput(ch, wr, enc);
4624             wr += width;
4625         }
4626         else removed = 1;
4627         rd += width;
4628     }
4629     *wr = '\0';
4630     STR_SET_LEN(str, wr - RSTRING_PTR(str));
4631
4632     ENC_CODERANGE_SET(str, saved_cr);
4633     return removed ? str : Qnil;
4634 }
4635
4636
4637 /*
4638  *  call-seq:
4639  *     str.delete([other_str]+)   => new_str
4640  *
4641  *  Returns a copy of <i>str</i> with all characters in the intersection of its
4642  *  arguments deleted. Uses the same rules for building the set of characters as
4643  *  <code>String#count</code>.
4644  *
4645  *     "hello".delete "l","lo"        #=> "heo"
4646  *     "hello".delete "lo"            #=> "he"
4647  *     "hello".delete "aeiou", "^e"   #=> "hell"
4648  *     "hello".delete "ej-m"          #=> "ho"
4649  */
4650
4651 static VALUE
4652 rb_str_delete(int argc, VALUE *argv, VALUE str)
4653 {
4654     VALUE pruned = rb_str_dup(str);          /* work on a duplicate; the receiver is not touched */
4655     rb_str_delete_bang(argc, argv, pruned);  /* Qnil (nothing deleted) is ignored; the copy is returned either way */
4656     return pruned;
4657 }
4658
4659
4660 /*
4661  *  call-seq:
4662  *     str.squeeze!([other_str]*)   => str or nil
4663  *
4664  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
4665  *  <code>nil</code> if no changes were made.
4666  */
4667
4668 static VALUE
4669 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
4670 {
4671     char member[256];
4672     rb_encoding *enc = 0;
4673     VALUE del = 0, nodel = 0;
4674     char *rd, *wr, *end;
4675     int prev = -1;    /* code point kept most recently; -1 until one has been kept */
4676     int n;
4677
4678     /* without arguments every run is squeezed and only str's own encoding is needed */
4679     if (argc == 0) enc = STR_ENC_GET(str);
4680     for (n = 0; n < argc; n++) {
4681         VALUE spec = argv[n];
4682
4683         StringValue(spec);
4684         enc = rb_enc_check(str, spec);
4685         tr_setup_table(spec, member, n == 0, &del, &nodel, enc);
4686     }
4687
4688     rb_str_modify(str);
4689     rd = wr = RSTRING_PTR(str);
4690     if (!rd || RSTRING_LEN(str) == 0) return Qnil;
4691     end = RSTRING_END(str);
4692
4693     /*
4694      * Copy characters down over the string, dropping one whenever it
4695      * repeats the previously kept character and (when arguments were
4696      * given) belongs to the selected set.
4697      */
4698     while (rd < end) {
4699         int ch = rb_enc_codepoint(rd, end, enc);
4700         int width = rb_enc_codelen(ch, enc);
4701         int repeat = (ch == prev) && (argc <= 0 || tr_find(ch, member, del, nodel));
4702         if (!repeat) {
4703             if (wr != rd) rb_enc_mbcput(ch, wr, enc);
4704             prev = ch;
4705             wr += width;
4706         }
4707         rd += width;
4708     }
4709     *wr = '\0';
4710
4711     /* nothing was dropped if the write cursor ended exactly at the old length */
4712     if (wr - RSTRING_PTR(str) == RSTRING_LEN(str)) return Qnil;
4713     STR_SET_LEN(str, wr - RSTRING_PTR(str));
4714     return str;
4715 }
4716
4717
4718 /*
4719  *  call-seq:
4720  *     str.squeeze([other_str]*)    => new_str
4721  *
4722  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
4723  *  procedure described for <code>String#count</code>. Returns a new string
4724  *  where runs of the same character that occur in this set are replaced by a
4725  *  single character. If no arguments are given, all runs of identical
4726  *  characters are replaced by a single character.
4727  *
4728  *     "yellow moon".squeeze                  #=> "yelow mon"
4729  *     "  now   is  the".squeeze(" ")         #=> " now is the"
4730  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
4731  */
4732
4733 static VALUE
4734 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
4735 {
4736     VALUE squeezed = rb_str_dup(str);           /* work on a duplicate; the receiver is not touched */
4737     rb_str_squeeze_bang(argc, argv, squeezed);  /* Qnil (nothing squeezed) is ignored; the copy is returned either way */
4738     return squeezed;
4739 }
4740
4741
4742 /*
4743  *  call-seq:
4744  *     str.tr_s!(from_str, to_str)   => str or nil
4745  *
4746  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
4747  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
4748  */
4749
4750 static VALUE
4751 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
4752 {
4753     return tr_trans(str, src, repl, 1);    /* sflag 1: translate and squeeze runs of the same replacement; tr_trans' result (str or Qnil) is passed through */
4754 }
4755
4756
4757 /*
4758  *  call-seq:
4759  *     str.tr_s(from_str, to_str)   => new_str
4760  *
4761  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
4762  *  then removes duplicate characters in regions that were affected by the
4763  *  translation.
4764  *
4765  *     "hello".tr_s('l', 'r')     #=> "hero"
4766  *     "hello".tr_s('el', '*')    #=> "h*o"
4767  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
4768  */
4769
4770 static VALUE
4771 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
4772 {
4773     VALUE translated = rb_str_dup(str);    /* work on a duplicate; the receiver is not touched */
4774     tr_trans(translated, src, repl, 1);    /* sflag 1: squeezing variant; Qnil (nothing changed) is ignored */
4775     return translated;
4776 }
4777
4778
4779 /*
4780  *  call-seq:
4781  *     str.count([other_str]+)   => fixnum
4782  *
4783  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
4784  *  intersection of these sets defines the characters to count in
4785  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
4786  *  negated. The sequence c1--c2 means all characters between c1 and c2.
4787  *
4788  *     a = "hello world"
4789  *     a.count "lo"            #=> 5
4790  *     a.count "lo", "o"       #=> 2
4791  *     a.count "hello", "^l"   #=> 4
4792  *     a.count "ej-m"          #=> 4
4793  */
4794
4795 static VALUE
4796 rb_str_count(int argc, VALUE *argv, VALUE str)
4797 {
4798     char member[256];
4799     rb_encoding *enc = 0;
4800     VALUE del = 0, nodel = 0;
4801     char *p, *pend;
4802     int n;
4803     int hits = 0;
4804
4805     if (argc < 1) rb_raise(rb_eArgError, "wrong number of arguments");
4806
4807     /* each argument narrows the set; the intersection is what gets counted */
4808     for (n = 0; n < argc; n++) {
4809         VALUE spec = argv[n];
4810
4811         StringValue(spec);
4812         enc = rb_enc_check(str, spec);
4813         tr_setup_table(spec, member, n == 0, &del, &nodel, enc);
4814     }
4815
4816     p = RSTRING_PTR(str);
4817     if (!p || RSTRING_LEN(str) == 0) return INT2FIX(0);
4818     pend = RSTRING_END(str);
4819
4820     /* walk the receiver one character at a time */
4821     while (p < pend) {
4822         int ch = rb_enc_codepoint(p, pend, enc);
4823
4824         if (tr_find(ch, member, del, nodel)) hits++;
4825         p += rb_enc_codelen(ch, enc);
4826     }
4827
4828     return INT2NUM(hits);
4829 }
4830
4831
4832 /*
4833  *  call-seq:
4834  *     str.split(pattern=$;, [limit])   => anArray
4835  *
4836  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
4837  *  of these substrings.
4838  *
4839  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
4840  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
4841  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
4842  *  of contiguous whitespace characters ignored.
4843  *
4844  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
4845  *  pattern matches. Whenever the pattern matches a zero-length string,
4846  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
4847  *  groups, the respective matches will be returned in the array as well.
4848  *
4849  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
4850  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
4851  *  split on whitespace as if ` ' were specified.
4852  *
4853  *  If the <i>limit</i> parameter is omitted, trailing null fields are
4854  *  suppressed. If <i>limit</i> is a positive number, at most that number of
4855  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
4856  *  string is returned as the only entry in an array). If negative, there is no
4857  *  limit to the number of fields returned, and trailing null fields are not
4858  *  suppressed.
4859  *
4860  *     " now's  the time".split        #=> ["now's", "the", "time"]
4861  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
4862  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
4863  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
4864  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
4865  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
4866  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
4867  *
4868  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
4869  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
4870  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
4871  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
4872  */
4873
4874 static VALUE    /* returns an Array holding the pieces of str */
4875 rb_str_split_m(int argc, VALUE *argv, VALUE str)
4876 {
4877     rb_encoding *enc;
4878     VALUE spat;
4879     VALUE limit;
4880     int awk_split = Qfalse;    /* set when splitting on runs of whitespace */
4881     long beg, end, i = 0;      /* beg/end: byte offsets of the current field; i: fields counted against lim */
4882     int lim = 0;
4883     VALUE result, tmp;
4884
4885     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
4886         lim = NUM2INT(limit);
4887         if (lim <= 0) limit = Qnil;    /* zero or negative: no field limit applies (lim itself is kept for later checks) */
4888         else if (lim == 1) {           /* limit of one: the whole string is the only field, or no field if it is empty */
4889             if (RSTRING_LEN(str) == 0)
4890                 return rb_ary_new2(0);
4891             return rb_ary_new3(1, str);
4892         }
4893         i = 1;
4894     }
4895
4896     enc = STR_ENC_GET(str);
4897     if (NIL_P(spat)) {
4898         if (!NIL_P(rb_fs)) {    /* no pattern given: use rb_fs when it is set */
4899             spat = rb_fs;
4900             goto fs_set;
4901         }
4902         awk_split = Qtrue;
4903     }
4904     else {
4905       fs_set:
4906         if (TYPE(spat) == T_STRING) {
4907             rb_encoding *enc2 = STR_ENC_GET(spat);
4908
4909             if (rb_enc_mbminlen(enc2) == 1) {
4910                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){    /* a lone space selects whitespace splitting */
4911                     awk_split = Qtrue;
4912                 }
4913             }
4914             else {
4915                 int l;
4916                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
4917                     RSTRING_LEN(spat) == l) {    /* same test for encodings whose shortest character is wider than one byte */
4918                     awk_split = Qtrue;
4919                 }
4920             }
4921             if (!awk_split) {
4922                 spat = rb_reg_regcomp(rb_reg_quote(spat));    /* any other string is turned into a regexp after quoting */
4923             }
4924         }
4925         else {
4926             spat = get_pat(spat, 1);
4927         }
4928     }
4929
4930     result = rb_ary_new();
4931     beg = 0;
4932     if (awk_split) {
4933         char *ptr = RSTRING_PTR(str);
4934         char *eptr = RSTRING_END(str);
4935         char *bptr = ptr;
4936         int skip = 1;    /* nonzero while passing over whitespace between fields */
4937         int c;
4938
4939         end = beg;
4940         while (ptr < eptr) {
4941             c = rb_enc_codepoint(ptr, eptr, enc);
4942             ptr += rb_enc_mbclen(ptr, eptr, enc);
4943             if (skip) {
4944                 if (rb_enc_isspace(c, enc)) {
4945                     beg = ptr - bptr;    /* still in whitespace: the field can only start after this character */
4946                 }
4947                 else {
4948                     end = ptr - bptr;
4949                     skip = 0;
4950                     if (!NIL_P(limit) && lim <= i) break;    /* limit reached: the rest of the string is pushed as one field below */
4951                 }
4952             }
4953             else {
4954                 if (rb_enc_isspace(c, enc)) {    /* whitespace ends the current field */
4955                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
4956                     skip = 1;
4957                     beg = ptr - bptr;
4958                     if (!NIL_P(limit)) ++i;
4959                 }
4960                 else {
4961                     end = ptr - bptr;
4962                 }
4963             }
4964         }
4965     }
4966     else {
4967         long start = beg;
4968         long idx;
4969         int last_null = 0;    /* set after an empty match made the search restart one character further on */
4970         struct re_registers *regs;
4971
4972         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
4973             regs = RMATCH_REGS(rb_backref_get());
4974             if (start == end && BEG(0) == END(0)) {    /* empty match right at the search position */
4975                 if (!RSTRING_PTR(str)) {
4976                     rb_ary_push(result, rb_str_new("", 0));
4977                     break;
4978                 }
4979                 else if (last_null == 1) {    /* second empty match in a row: emit the single character at beg */
4980                     rb_ary_push(result, rb_str_subseq(str, beg,
4981                                                       rb_enc_mbclen(RSTRING_PTR(str)+beg,
4982                                                                     RSTRING_END(str),
4983                                                                     enc)));
4984                     beg = start;
4985                 }
4986                 else {    /* first empty match: advance the search by one character and retry */
4987                     if (RSTRING_PTR(str)+start == RSTRING_END(str))
4988                         start++;
4989                     else
4990                         start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
4991                     last_null = 1;
4992                     continue;
4993                 }
4994             }
4995             else {    /* ordinary match: the field is the text between beg and the match */
4996                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
4997                 beg = start = END(0);
4998             }
4999             last_null = 0;
5000
5001             for (idx=1; idx < regs->num_regs; idx++) {    /* captured groups are appended after the field */
5002                 if (BEG(idx) == -1) continue;
5003                 if (BEG(idx) == END(idx))
5004                     tmp = rb_str_new5(str, 0, 0);
5005                 else
5006                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
5007                 rb_ary_push(result, tmp);
5008             }
5009             if (!NIL_P(limit) && lim <= ++i) break;
5010         }
5011     }
5012     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {    /* push whatever follows the last separator */
5013         if (RSTRING_LEN(str) == beg)
5014             tmp = rb_str_new5(str, 0, 0);
5015         else
5016             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
5017         rb_ary_push(result, tmp);
5018     }
5019     if (NIL_P(limit) && lim == 0) {    /* no limit given (or zero): drop trailing empty fields */
5020         while (RARRAY_LEN(result) > 0 &&
5021                RSTRING_LEN(RARRAY_PTR(result)[RARRAY_LEN(result)-1]) == 0)
5022             rb_ary_pop(result);
5023     }
5024
5025     return result;
5026 }
5027
5028 VALUE
5029 rb_str_split(VALUE str, const char *sep0)
5030 {
5031     VALUE pattern;
5032
5033     StringValue(str);                           /* coerce the receiver before it is used */
5034     pattern = rb_str_new2(sep0);                /* wrap the C string separator as a Ruby String */
5035     return rb_str_split_m(1, &pattern, str);    /* one argument: the pattern, no limit */
5036 }
5037
5038
5039 /*
5040  *  Document-method: lines
5041  *  call-seq:
5042  *     str.lines(separator=$/)   => anEnumerator
5043  *     str.lines(separator=$/) {|substr| block }        => str
5044  *
5045  *  Returns an enumerator that gives each line in the string.  If a block is
5046  *  given, it iterates over each line in the string.
5047  *
5048  *     "foo\nbar\n".lines.to_a   #=> ["foo\n", "bar\n"]
5049  *     "foo\nb ar".lines.sort    #=> ["b ar", "foo\n"]
5050  */
5051
5052 /*
5053  *  Document-method: each_line
5054  *  call-seq:
5055  *     str.each_line(separator=$/) {|substr| block }   => str
5056  *
5057  *  Splits <i>str</i> using the supplied parameter as the record separator
5058  *  (<code>$/</code> by default), passing each substring in turn to the supplied
5059  *  block. If a zero-length record separator is supplied, the string is split
5060  *  into paragraphs delimited by multiple successive newlines.
5061  *
5062  *     print "Example one\n"
5063  *     "hello\nworld".each_line {|s| p s}
5064  *     print "Example two\n"
5065  *     "hello\nworld".each_line('l') {|s| p s}
5066  *     print "Example three\n"
5067  *     "hello\n\n\nworld".each_line('') {|s| p s}
5068  *
5069  *  <em>produces:</em>
5070  *
5071  *     Example one
5072  *     "hello\n"
5073  *     "world"
5074  *     Example two
5075  *     "hel"
5076  *     "l"
5077  *     "o\nworl"
5078  *     "d"
5079  *     Example three
5080  *     "hello\n\n\n"
5081  *     "world"
5082  */
5083
5084 static VALUE    /* returns the original receiver (or whatever RETURN_ENUMERATOR returns) */
5085 rb_str_each_line(int argc, VALUE *argv, VALUE str)
5086 {
5087     rb_encoding *enc;
5088     VALUE rs;
5089     int newline;    /* first code point of the separator, or '\n' for an empty separator */
5090     char *p, *pend, *s, *ptr;    /* p: scan position; pend: end of data; s: start of the current line; ptr: data start passed to str_mod_check */
5091     long len, rslen;
5092     VALUE line;
5093     int n;
5094     VALUE orig = str;    /* str is reassigned below; this is what gets returned */
5095
5096     if (argc == 0) {
5097         rs = rb_rs;
5098     }
5099     else {
5100         rb_scan_args(argc, argv, "01", &rs);
5101     }
5102     RETURN_ENUMERATOR(str, argc, argv);
5103     if (NIL_P(rs)) {    /* nil separator: the whole string is yielded once */
5104         rb_yield(str);
5105         return orig;
5106     }
5107     str = rb_str_new4(str);
5108     ptr = p = s = RSTRING_PTR(str);
5109     pend = p + RSTRING_LEN(str);
5110     len = RSTRING_LEN(str);
5111     StringValue(rs);
5112     if (rs == rb_default_rs) {    /* default separator: locate '\n' bytes with memchr */
5113         enc = rb_enc_get(str);
5114         while (p < pend) {
5115             char *p0;
5116
5117             p = memchr(p, '\n', pend - p);
5118             if (!p) break;
5119             p0 = rb_enc_left_char_head(s, p, enc);
5120             if (!rb_enc_is_newline(p0, pend, enc)) {    /* the byte is not a newline character at p0: keep searching */
5121                 p++;
5122                 continue;
5123             }
5124             p = p0 + rb_enc_mbclen(p0, pend, enc);
5125             line = rb_str_new5(str, s, p - s);
5126             OBJ_INFECT(line, str);
5127             rb_enc_cr_str_copy_for_substr(line, str);
5128             rb_yield(line);
5129             str_mod_check(str, ptr, len);    /* called after every yield with the pointer and length captured above */
5130             s = p;
5131         }
5132         goto finish;
5133     }
5134
5135     enc = rb_enc_check(str, rs);
5136     rslen = RSTRING_LEN(rs);
5137     if (rslen == 0) {    /* empty separator: lines end at runs of '\n' (see the loop below) */
5138         newline = '\n';
5139     }
5140     else {
5141         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
5142     }
5143
5144     while (p < pend) {
5145         int c = rb_enc_codepoint(p, pend, enc);
5146
5147       again:
5148         n = rb_enc_codelen(c, enc);
5149         if (rslen == 0 && c == newline) {
5150             p += n;
5151             if (p < pend && (c = rb_enc_codepoint(p, pend, enc)) != newline) {    /* a single newline does not end a line here: resume with the next character */
5152                 goto again;
5153             }
5154             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {    /* swallow the rest of the newline run */
5155                 p += n;
5156             }
5157             p -= n;    /* step back onto the last newline so the code below ends the line after it */
5158         }
5159         if (c == newline &&
5160             (rslen <= 1 || memcmp(RSTRING_PTR(rs), p, rslen) == 0)) {    /* separators longer than one byte are confirmed with memcmp */
5161             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
5162             OBJ_INFECT(line, str);
5163             rb_enc_cr_str_copy_for_substr(line, str);
5164             rb_yield(line);
5165             str_mod_check(str, ptr, len);
5166             s = p + (rslen ? rslen : n);
5167         }
5168         p += n;
5169     }
5170
5171   finish:
5172     if (s != pend) {    /* trailing text after the last separator is yielded as a final line */
5173         line = rb_str_new5(str, s, pend - s);
5174         OBJ_INFECT(line, str);
5175         rb_enc_cr_str_copy_for_substr(line, str);
5176         rb_yield(line);
5177     }
5178
5179     return orig;
5180 }
5181
5182
5183 /*
5184  *  Document-method: bytes
5185  *  call-seq:
5186  *     str.bytes   => anEnumerator
5187  *     str.bytes {|fixnum| block }    => str
5188  *
5189  *  Returns an enumerator that gives each byte in the string.  If a block is
5190  *  given, it iterates over each byte in the string.
5191  *
5192  *     "hello".bytes.to_a        #=> [104, 101, 108, 108, 111]
5193  */
5194
5195 /*
5196  *  Document-method: each_byte
5197  *  call-seq:
5198  *     str.each_byte {|fixnum| block }    => str
5199  *
5200  *  Passes each byte in <i>str</i> to the given block.
5201  *
5202  *     "hello".each_byte {|c| print c, ' ' }
5203  *
5204  *  <em>produces:</em>
5205  *
5206  *     104 101 108 108 111
5207  */
5208
5209 static VALUE
5210 rb_str_each_byte(VALUE str)
5211 {
5212     long i;
5213
5214     RETURN_ENUMERATOR(str, 0, 0);
5215     for (i=0; i<RSTRING_LEN(str); i++) {
5216         rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
5217     }
5218     return str;
5219 }
5220
5221
5222 /*
5223  *  Document-method: chars
5224  *  call-seq:
5225  *     str.chars                   => anEnumerator
5226  *     str.chars {|substr| block } => str
5227  *
5228  *  Returns an enumerator that gives each character in the string.
5229  *  If a block is given, it iterates over each character in the string.
5230  *
5231  *     "foo".chars.to_a   #=> ["f","o","o"]
5232  */
5233
5234 /*
5235  *  Document-method: each_char
5236  *  call-seq:
5237  *     str.each_char {|cstr| block }    => str
5238  *
5239  *  Passes each character in <i>str</i> to the given block.
5240  *
5241  *     "hello".each_char {|c| print c, ' ' }
5242  *
5243  *  <em>produces:</em>
5244  *
5245  *     h e l l o
5246  */
5247
5248 static VALUE
5249 rb_str_each_char(VALUE str)
5250 {
5251     int i, len, n;
5252     const char *ptr;
5253     rb_encoding *enc;
5254
5255     RETURN_ENUMERATOR(str, 0, 0);
5256     str = rb_str_new4(str);
5257     ptr = RSTRING_PTR(str);
5258     len = RSTRING_LEN(str);
5259     enc = rb_enc_get(str);
5260     for (i = 0; i < len; i += n) {
5261         n = rb_enc_mbclen(ptr + i, ptr + len, enc);
5262         rb_yield(rb_str_subseq(str, i, n));
5263     }
5264     return str;
5265 }
5266
5267 static long
5268 chopped_length(VALUE str)
5269 {
5270     rb_encoding *enc = STR_ENC_GET(str);
5271     const char *p, *p2, *beg, *end;
5272
5273     beg = RSTRING_PTR(str);
5274     end = beg + RSTRING_LEN(str);
5275     if (beg > end) return 0;
5276     p = rb_enc_prev_char(beg, end, enc);
5277     if (!p) return 0;
5278     if (p > beg && rb_enc_codepoint(p, end, enc) == '\n') {
5279         p2 = rb_enc_prev_char(beg, p, enc);
5280         if (p2 && rb_enc_codepoint(p2, end, enc) == '\r') p = p2;
5281     }
5282     return p - beg;
5283 }
5284
5285 /*
5286  *  call-seq:
5287  *     str.chop!   => str or nil
5288  *
5289  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
5290  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
5291  *  <code>String#chomp!</code>.
5292  */
5293
5294 static VALUE
5295 rb_str_chop_bang(VALUE str)
5296 {
5297     if (RSTRING_LEN(str) > 0) {
5298         long len;
5299         rb_str_modify(str);
5300         len = chopped_length(str);
5301         STR_SET_LEN(str, len);
5302         RSTRING_PTR(str)[len] = '\0';
5303         return str;
5304     }
5305     return Qnil;
5306 }
5307
5308
5309 /*
5310  *  call-seq:
5311  *     str.chop   => new_str
5312  *
5313  *  Returns a new <code>String</code> with the last character removed.  If the
5314  *  string ends with <code>\r\n</code>, both characters are removed. Applying
5315  *  <code>chop</code> to an empty string returns an empty
5316  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
5317  *  the string unchanged if it doesn't end in a record separator.
5318  *
5319  *     "string\r\n".chop   #=> "string"
5320  *     "string\n\r".chop   #=> "string\n"
5321  *     "string\n".chop     #=> "string"
5322  *     "string".chop       #=> "strin"
5323  *     "x".chop.chop       #=> ""
5324  */
5325
5326 static VALUE
5327 rb_str_chop(VALUE str)
5328 {
5329     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
5330     rb_enc_cr_str_copy_for_substr(str2, str);
5331     OBJ_INFECT(str2, str);
5332     return str2;
5333 }
5334
5335
5336 /*
5337  *  call-seq:
5338  *     str.chomp!(separator=$/)   => str or nil
5339  *
5340  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
5341  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
5342  */
5343
5344 static VALUE
5345 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
5346 {
5347     rb_encoding *enc;
5348     VALUE rs;
5349     int newline;
5350     char *p, *pp, *e;
5351     long len, rslen;
5352
5353     len = RSTRING_LEN(str);
5354     if (len == 0) return Qnil;
5355     p = RSTRING_PTR(str);
5356     e = p + len;
5357     if (argc == 0) {
5358         rs = rb_rs;
5359         if (rs == rb_default_rs) {
5360           smart_chomp:
5361             rb_str_modify(str);
5362             enc = rb_enc_get(str);
5363             if (rb_enc_mbminlen(enc) > 1) {
5364                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), enc);
5365                 if (rb_enc_is_newline(pp, e, enc)) {
5366                     e = pp;
5367                 }
5368                 pp = e - rb_enc_mbminlen(enc);
5369                 if (pp >= p) {
5370                     pp = rb_enc_left_char_head(p, pp, enc);
5371                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
5372                         e = pp;
5373                     }
5374                 }
5375                 if (e == RSTRING_END(str)) {
5376                     return Qnil;
5377                 }
5378                 len = e - RSTRING_PTR(str);
5379                 STR_SET_LEN(str, len);
5380             }
5381             else {
5382                 if (RSTRING_PTR(str)[len-1] == '\n') {
5383                     STR_DEC_LEN(str);
5384                     if (RSTRING_LEN(str) > 0 &&
5385                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
5386                         STR_DEC_LEN(str);
5387                     }
5388                 }
5389                 else if (RSTRING_PTR(str)[len-1] == '\r') {
5390                     STR_DEC_LEN(str);
5391                 }
5392                 else {
5393                     return Qnil;
5394                 }
5395             }
5396             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5397             return str;
5398         }
5399     }
5400     else {
5401         rb_scan_args(argc, argv, "01", &rs);
5402     }
5403     if (NIL_P(rs)) return Qnil;
5404     StringValue(rs);
5405     rslen = RSTRING_LEN(rs);
5406     if (rslen == 0) {
5407         while (len>0 && p[len-1] == '\n') {
5408             len--;
5409             if (len>0 && p[len-1] == '\r')
5410                 len--;
5411         }
5412         if (len < RSTRING_LEN(str)) {
5413             rb_str_modify(str);
5414             STR_SET_LEN(str, len);
5415             RSTRING_PTR(str)[len] = '\0';
5416             return str;
5417         }
5418         return Qnil;
5419     }
5420     if (rslen > len) return Qnil;
5421     newline = RSTRING_PTR(rs)[rslen-1];
5422     if (rslen == 1 && newline == '\n')
5423         goto smart_chomp;
5424
5425     enc = rb_enc_check(str, rs);
5426     if (is_broken_string(rs)) {
5427         return Qnil;
5428     }
5429     pp = e - rslen;
5430     if (p[len-1] == newline &&
5431         (rslen <= 1 ||
5432          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
5433         if (rb_enc_left_char_head(p, pp, enc) != pp)
5434             return Qnil;
5435         rb_str_modify(str);
5436         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
5437         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5438         return str;
5439     }
5440     return Qnil;
5441 }
5442
5443
5444 /*
5445  *  call-seq:
5446  *     str.chomp(separator=$/)   => new_str
5447  *
5448  *  Returns a new <code>String</code> with the given record separator removed
5449  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
5450  *  changed from the default Ruby record separator, then <code>chomp</code> also
5451  *  removes carriage return characters (that is it will remove <code>\n</code>,
5452  *  <code>\r</code>, and <code>\r\n</code>).
5453  *
5454  *     "hello".chomp            #=> "hello"
5455  *     "hello\n".chomp          #=> "hello"
5456  *     "hello\r\n".chomp        #=> "hello"
5457  *     "hello\n\r".chomp        #=> "hello\n"
5458  *     "hello\r".chomp          #=> "hello"
5459  *     "hello \n there".chomp   #=> "hello \n there"
5460  *     "hello".chomp("llo")     #=> "he"
5461  */
5462
5463 static VALUE
5464 rb_str_chomp(int argc, VALUE *argv, VALUE str)
5465 {
5466     str = rb_str_dup(str);
5467     rb_str_chomp_bang(argc, argv, str);
5468     return str;
5469 }
5470
5471 /*
5472  *  call-seq:
5473  *     str.lstrip!   => self or nil
5474  *
5475  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
5476  *  change was made. See also <code>String#rstrip!</code> and
5477  *  <code>String#strip!</code>.
5478  *
5479  *     "  hello  ".lstrip   #=> "hello  "
5480  *     "hello".lstrip!      #=> nil
5481  */
5482
5483 static VALUE
5484 rb_str_lstrip_bang(VALUE str)
5485 {
5486     rb_encoding *enc;
5487     char *s, *t, *e;
5488
5489     rb_str_modify(str);
5490     enc = STR_ENC_GET(str);
5491     s = RSTRING_PTR(str);
5492     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5493     e = t = RSTRING_END(str);
5494     /* remove spaces at head */
5495     while (s < e) {
5496         int cc = rb_enc_codepoint(s, e, enc);
5497
5498         if (!rb_enc_isspace(cc, enc)) break;
5499         s += rb_enc_codelen(cc, enc);
5500     }
5501
5502     if (s > RSTRING_PTR(str)) {
5503         rb_str_modify(str);
5504         STR_SET_LEN(str, t-s);
5505         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
5506         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5507         return str;
5508     }
5509     return Qnil;
5510 }
5511
5512
5513 /*
5514  *  call-seq:
5515  *     str.lstrip   => new_str
5516  *
5517  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
5518  *  <code>String#rstrip</code> and <code>String#strip</code>.
5519  *
5520  *     "  hello  ".lstrip   #=> "hello  "
5521  *     "hello".lstrip       #=> "hello"
5522  */
5523
5524 static VALUE
5525 rb_str_lstrip(VALUE str)
5526 {
5527     str = rb_str_dup(str);
5528     rb_str_lstrip_bang(str);
5529     return str;
5530 }
5531
5532
5533 /*
5534  *  call-seq:
5535  *     str.rstrip!   => self or nil
5536  *
5537  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
5538  *  no change was made. See also <code>String#lstrip!</code> and
5539  *  <code>String#strip!</code>.
5540  *
5541  *     "  hello  ".rstrip   #=> "  hello"
5542  *     "hello".rstrip!      #=> nil
5543  */
5544
5545 static VALUE
5546 rb_str_rstrip_bang(VALUE str)
5547 {
5548     rb_encoding *enc;
5549     char *s, *t, *e;
5550     int space_seen = Qfalse;
5551
5552     rb_str_modify(str);
5553     enc = STR_ENC_GET(str);
5554     s = RSTRING_PTR(str);
5555     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5556     t = e = RSTRING_END(str);
5557     while (s < e) {
5558         int cc = rb_enc_codepoint(s, e, enc);
5559
5560         if (!cc || rb_enc_isspace(cc, enc)) {
5561             if (!space_seen) t = s;
5562             space_seen = Qtrue;
5563         }
5564         else {
5565             space_seen = Qfalse;
5566         }
5567         s += rb_enc_codelen(cc, enc);
5568     }
5569     if (!space_seen) t = s;
5570     if (t < e) {
5571         rb_str_modify(str);
5572         STR_SET_LEN(str, t-RSTRING_PTR(str));
5573         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5574         return str;
5575     }
5576     return Qnil;
5577 }
5578
5579
5580 /*
5581  *  call-seq:
5582  *     str.rstrip   => new_str
5583  *
5584  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
5585  *  <code>String#lstrip</code> and <code>String#strip</code>.
5586  *
5587  *     "  hello  ".rstrip   #=> "  hello"
5588  *     "hello".rstrip       #=> "hello"
5589  */
5590
5591 static VALUE
5592 rb_str_rstrip(VALUE str)
5593 {
5594     str = rb_str_dup(str);
5595     rb_str_rstrip_bang(str);
5596     return str;
5597 }
5598
5599
5600 /*
5601  *  call-seq:
5602  *     str.strip!   => str or nil
5603  *
5604  *  Removes leading and trailing whitespace from <i>str</i>. Returns
5605  *  <code>nil</code> if <i>str</i> was not altered.
5606  */
5607
5608 static VALUE
5609 rb_str_strip_bang(VALUE str)
5610 {
5611     VALUE l = rb_str_lstrip_bang(str);
5612     VALUE r = rb_str_rstrip_bang(str);
5613
5614     if (NIL_P(l) && NIL_P(r)) return Qnil;
5615     return str;
5616 }
5617
5618
5619 /*
5620  *  call-seq:
5621  *     str.strip   => new_str
5622  *
5623  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
5624  *
5625  *     "    hello    ".strip   #=> "hello"
5626  *     "\tgoodbye\r\n".strip   #=> "goodbye"
5627  */
5628
5629 static VALUE
5630 rb_str_strip(VALUE str)
5631 {
5632     str = rb_str_dup(str);
5633     rb_str_strip_bang(str);
5634     return str;
5635 }
5636
5637 static VALUE
5638 scan_once(VALUE str, VALUE pat, long *start)
5639 {
5640     VALUE result, match;
5641     struct re_registers *regs;
5642     long i;
5643
5644     if (rb_reg_search(pat, str, *start, 0) >= 0) {
5645         match = rb_backref_get();
5646         regs = RMATCH_REGS(match);
5647         if (BEG(0) == END(0)) {
5648             rb_encoding *enc = STR_ENC_GET(str);
5649             /*
5650              * Always consume at least one character of the input string
5651              */
5652             if (RSTRING_LEN(str) > END(0))
5653                 *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
5654                                               RSTRING_END(str), enc);
5655             else
5656                 *start = END(0)+1;
5657         }
5658         else {
5659             *start = END(0);
5660         }
5661         if (regs->num_regs == 1) {
5662             return rb_reg_nth_match(0, match);
5663         }
5664         result = rb_ary_new2(regs->num_regs);
5665         for (i=1; i < regs->num_regs; i++) {
5666             rb_ary_push(result, rb_reg_nth_match(i, match));
5667         }
5668
5669         return result;
5670     }
5671     return Qnil;
5672 }
5673
5674
5675 /*
5676  *  call-seq:
5677  *     str.scan(pattern)                         => array
5678  *     str.scan(pattern) {|match, ...| block }   => str
5679  *
5680  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
5681  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
5682  *  generated and either added to the result array or passed to the block. If
5683  *  the pattern contains no groups, each individual result consists of the
5684  *  matched string, <code>$&</code>.  If the pattern contains groups, each
5685  *  individual result is itself an array containing one entry per group.
5686  *
5687  *     a = "cruel world"
5688  *     a.scan(/\w+/)        #=> ["cruel", "world"]
5689  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
5690  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
5691  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
5692  *
5693  *  And the block form:
5694  *
5695  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
5696  *     print "\n"
5697  *     a.scan(/(.)(.)/) {|x,y| print y, x }
5698  *     print "\n"
5699  *
5700  *  <em>produces:</em>
5701  *
5702  *     <<cruel>> <<world>>
5703  *     rceu lowlr
5704  */
5705
5706 static VALUE
5707 rb_str_scan(VALUE str, VALUE pat)
5708 {
5709     VALUE result;
5710     long start = 0;
5711     long last = -1, prev = 0;
5712     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
5713
5714     pat = get_pat(pat, 1);
5715     if (!rb_block_given_p()) {
5716         VALUE ary = rb_ary_new();
5717
5718         while (!NIL_P(result = scan_once(str, pat, &start))) {
5719             last = prev;
5720             prev = start;
5721             rb_ary_push(ary, result);
5722         }
5723         if (last >= 0) rb_reg_search(pat, str, last, 0);
5724         return ary;
5725     }
5726
5727     while (!NIL_P(result = scan_once(str, pat, &start))) {
5728         last = prev;
5729         prev = start;
5730         rb_yield(result);
5731         str_mod_check(str, p, len);
5732     }
5733     if (last >= 0) rb_reg_search(pat, str, last, 0);
5734     return str;
5735 }
5736
5737
5738 /*
5739  *  call-seq:
5740  *     str.hex   => integer
5741  *
5742  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
5743  *  (with an optional sign and an optional <code>0x</code>) and returns the
5744  *  corresponding number. Zero is returned on error.
5745  *
5746  *     "0x0a".hex     #=> 10
5747  *     "-1234".hex    #=> -4660
5748  *     "0".hex        #=> 0
5749  *     "wombat".hex   #=> 0
5750  */
5751
5752 static VALUE
5753 rb_str_hex(VALUE str)
5754 {
5755     rb_encoding *enc = rb_enc_get(str);
5756
5757     if (!rb_enc_asciicompat(enc)) {
5758         rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5759     }
5760     return rb_str_to_inum(str, 16, Qfalse);
5761 }
5762
5763
5764 /*
5765  *  call-seq:
5766  *     str.oct   => integer
5767  *
5768  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
5769  *  optional sign) and returns the corresponding number.  Returns 0 if the
5770  *  conversion fails.
5771  *
5772  *     "123".oct       #=> 83
5773  *     "-377".oct      #=> -255
5774  *     "bad".oct       #=> 0
5775  *     "0377bad".oct   #=> 255
5776  */
5777
5778 static VALUE
5779 rb_str_oct(VALUE str)
5780 {
5781     rb_encoding *enc = rb_enc_get(str);
5782
5783     if (!rb_enc_asciicompat(enc)) {
5784         rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5785     }
5786     return rb_str_to_inum(str, -8, Qfalse);
5787 }
5788
5789
5790 /*
5791  *  call-seq:
5792  *     str.crypt(other_str)   => new_str
5793  *
5794  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
5795  *  library function <code>crypt</code>. The argument is the salt string, which
5796  *  should be two characters long, each character drawn from
5797  *  <code>[a-zA-Z0-9./]</code>.
5798  */
5799
5800 static VALUE
5801 rb_str_crypt(VALUE str, VALUE salt)
5802 {
5803     extern char *crypt(const char *, const char *);
5804     VALUE result;
5805     const char *s;
5806
5807     StringValue(salt);
5808     if (RSTRING_LEN(salt) < 2)
5809         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
5810
5811     if (RSTRING_PTR(str)) s = RSTRING_PTR(str);
5812     else s = "";
5813     result = rb_str_new2(crypt(s, RSTRING_PTR(salt)));
5814     OBJ_INFECT(result, str);
5815     OBJ_INFECT(result, salt);
5816     return result;
5817 }
5818
5819
5820 /*
5821  *  call-seq:
5822  *     str.intern   => symbol
5823  *     str.to_sym   => symbol
5824  *
5825  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
5826  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
5827  *
5828  *     "Koala".intern         #=> :Koala
5829  *     s = 'cat'.to_sym       #=> :cat
5830  *     s == :cat              #=> true
5831  *     s = '@cat'.to_sym      #=> :@cat
5832  *     s == :@cat             #=> true
5833  *
5834  *  This can also be used to create symbols that cannot be represented using the
5835  *  <code>:xxx</code> notation.
5836  *
5837  *     'cat and dog'.to_sym   #=> :"cat and dog"
5838  */
5839
5840 VALUE
5841 rb_str_intern(VALUE s)
5842 {
5843     VALUE str = RB_GC_GUARD(s);
5844     ID id;
5845
5846     id = rb_intern_str(str);
5847     return ID2SYM(id);
5848 }
5849
5850
5851 /*
5852  *  call-seq:
5853  *     str.ord   => integer
5854  *
5855  *  Return the <code>Integer</code> ordinal of a one-character string.
5856  *
5857  *     "a".ord         #=> 97
5858  */
5859
5860 VALUE
5861 rb_str_ord(VALUE s)
5862 {
5863     int c;
5864
5865     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
5866     return INT2NUM(c);
5867 }
5868 /*
5869  *  call-seq:
5870  *     str.sum(n=16)   => integer
5871  *
5872  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
5873  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
5874  *  to 16. The result is simply the sum of the binary value of each byte in
5875  *  <i>str</i> modulo <code>2**n</code>. This is not a particularly good
5876  *  checksum.
5877  */
5878
5879 static VALUE
5880 rb_str_sum(int argc, VALUE *argv, VALUE str)
5881 {
5882     VALUE vbits;
5883     int bits;
5884     char *ptr, *p, *pend;
5885     long len;
5886
5887     if (argc == 0) {
5888         bits = 16;
5889     }
5890     else {
5891         rb_scan_args(argc, argv, "01", &vbits);
5892         bits = NUM2INT(vbits);
5893     }
5894     ptr = p = RSTRING_PTR(str);
5895     len = RSTRING_LEN(str);
5896     pend = p + len;
5897     if (bits >= sizeof(long)*CHAR_BIT) {
5898         VALUE sum = INT2FIX(0);
5899
5900         while (p < pend) {
5901             str_mod_check(str, ptr, len);
5902             sum = rb_funcall(sum, '+', 1, INT2FIX((unsigned char)*p));
5903             p++;
5904         }
5905         if (bits != 0) {
5906             VALUE mod;
5907
5908             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
5909             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
5910             sum = rb_funcall(sum, '&', 1, mod);
5911         }
5912         return sum;
5913     }
5914     else {
5915        unsigned long sum = 0;
5916
5917         while (p < pend) {
5918             str_mod_check(str, ptr, len);
5919             sum += (unsigned char)*p;
5920             p++;
5921         }
5922         if (bits != 0) {
5923            sum &= (((unsigned long)1)<<bits)-1;
5924         }
5925         return rb_int2inum(sum);
5926     }
5927 }
5928
5929 static VALUE
5930 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
5931 {
5932     rb_encoding *enc;
5933     VALUE w;
5934     long width, len, flen = 1, fclen = 1;
5935     VALUE res;
5936     char *p;
5937     const char *f = " ";
5938     long n, llen, rlen;
5939     volatile VALUE pad;
5940     int singlebyte = 1;
5941
5942     rb_scan_args(argc, argv, "11", &w, &pad);
5943     enc = STR_ENC_GET(str);
5944     width = NUM2LONG(w);
5945     if (argc == 2) {
5946         StringValue(pad);
5947         enc = rb_enc_check(str, pad);
5948         f = RSTRING_PTR(pad);
5949         flen = RSTRING_LEN(pad);
5950         fclen = str_strlen(pad, enc);
5951         singlebyte = single_byte_optimizable(pad);
5952         if (flen == 0 || fclen == 0) {
5953             rb_raise(rb_eArgError, "zero width padding");
5954         }
5955     }
5956     len = str_strlen(str, enc);
5957     if (width < 0 || len >= width) return rb_str_dup(str);
5958     n = width - len;
5959     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
5960     rlen = n - llen;
5961     res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2);
5962     p = RSTRING_PTR(res);
5963     while (llen) {
5964         if (flen <= 1) {
5965             *p++ = *f;
5966             llen--;
5967         }
5968         else if (llen > fclen) {
5969             memcpy(p,f,flen);
5970             p += flen;
5971             llen -= fclen;
5972         }
5973         else {
5974             char *fp = str_nth(f, f+flen, llen, enc, singlebyte);
5975             n = fp - f;
5976             memcpy(p,f,n);
5977             p+=n;
5978             break;
5979         }
5980     }
5981     memcpy(p, RSTRING_PTR(str), RSTRING_LEN(str));
5982     p+=RSTRING_LEN(str);
5983     while (rlen) {
5984         if (flen <= 1) {
5985             *p++ = *f;
5986             rlen--;
5987         }
5988         else if (rlen > fclen) {
5989             memcpy(p,f,flen);
5990             p += flen;
5991             rlen -= fclen;
5992         }
5993         else {
5994             char *fp = str_nth(f, f+flen, rlen, enc, singlebyte);
5995             n = fp - f;
5996             memcpy(p,f,n);
5997             p+=n;
5998             break;
5999         }
6000     }
6001     *p = '\0';
6002     STR_SET_LEN(res, p-RSTRING_PTR(res));
6003     OBJ_INFECT(res, str);
6004     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
6005     rb_enc_associate(res, enc);
6006     return res;
6007 }
6008
6009
6010 /*
6011  *  call-seq:
6012  *     str.ljust(integer, padstr=' ')   => new_str
6013  *
6014  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6015  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
6016  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6017  *
6018  *     "hello".ljust(4)            #=> "hello"
6019  *     "hello".ljust(20)           #=> "hello               "
6020  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
6021  */
6022
6023 static VALUE
6024 rb_str_ljust(int argc, VALUE *argv, VALUE str)
6025 {
6026     return rb_str_justify(argc, argv, str, 'l');
6027 }
6028
6029
6030 /*
6031  *  call-seq:
6032  *     str.rjust(integer, padstr=' ')   => new_str
6033  *
6034  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6035  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
6036  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6037  *
6038  *     "hello".rjust(4)            #=> "hello"
6039  *     "hello".rjust(20)           #=> "               hello"
6040  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
6041  */
6042
6043 static VALUE
6044 rb_str_rjust(int argc, VALUE *argv, VALUE str)
6045 {
6046     return rb_str_justify(argc, argv, str, 'r');
6047 }
6048
6049
6050 /*
6051  *  call-seq:
6052  *     str.center(integer, padstr=' ')   => new_str
6053  *
6054  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6055  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
6056  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6057  *
6058  *     "hello".center(4)         #=> "hello"
6059  *     "hello".center(20)        #=> "       hello        "
6060  *     "hello".center(20, '123') #=> "1231231hello12312312"
6061  */
6062
6063 static VALUE
6064 rb_str_center(int argc, VALUE *argv, VALUE str)
6065 {
6066     return rb_str_justify(argc, argv, str, 'c');
6067 }
6068
6069 /*
6070  *  call-seq:
6071  *     str.partition(sep)              => [head, sep, tail]
6072  *
6073  *  Searches the string for <i>sep</i> and returns the part before
6074  *  it, the <i>sep</i>, and the part after it.  If <i>sep</i> is not found,
6075  *  returns <i>str</i> and two empty strings.
6076  *
6077  *     "hello".partition("l")         #=> ["he", "l", "lo"]
6078  *     "hello".partition("x")         #=> ["hello", "", ""]
6079  */
6080
6081 static VALUE
6082 rb_str_partition(VALUE str, VALUE sep)
6083 {
6084     long pos;
6085     int regex = Qfalse;
6086
6087     if (TYPE(sep) == T_REGEXP) {
6088         pos = rb_reg_search(sep, str, 0, 0);
6089         regex = Qtrue;
6090     }
6091     else {
6092         VALUE tmp;
6093
6094         tmp = rb_check_string_type(sep);
6095         if (NIL_P(tmp)) {
6096             rb_raise(rb_eTypeError, "type mismatch: %s given",
6097                      rb_obj_classname(sep));
6098         }
6099         pos = rb_str_index(str, sep, 0);
6100     }
6101     if (pos < 0) {
6102       failed:
6103         return rb_ary_new3(3, str, rb_str_new(0,0),rb_str_new(0,0));
6104     }
6105     if (regex) {
6106         sep = rb_str_subpat(str, sep, 0);
6107         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
6108     }
6109     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
6110                           sep,
6111                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
6112                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
6113 }
6114
6115 /*
6116  *  call-seq:
6117  *     str.rpartition(sep)            => [head, sep, tail]
6118  *
6119  *  Searches <i>sep</i> in the string from the end of the string, and
6120  *  returns the part before it, the <i>sep</i>, and the part after it.
6121  *  If <i>sep</i> is not found, returns two empty strings and
6122  *  <i>str</i>.
6123  *
6124  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
6125  *     "hello".rpartition("x")         #=> ["", "", "hello"]
6126  */
6127
6128 static VALUE
6129 rb_str_rpartition(VALUE str, VALUE sep)
6130 {
6131     long pos = RSTRING_LEN(str);
6132     int regex = Qfalse;
6133
6134     if (TYPE(sep) == T_REGEXP) {
6135         pos = rb_reg_search(sep, str, pos, 1);
6136         regex = Qtrue;
6137     }
6138     else {
6139         VALUE tmp;
6140
6141         tmp = rb_check_string_type(sep);
6142         if (NIL_P(tmp)) {
6143             rb_raise(rb_eTypeError, "type mismatch: %s given",
6144                      rb_obj_classname(sep));
6145         }
6146         pos = rb_str_sublen(str, pos);
6147         pos = rb_str_rindex(str, sep, pos);
6148     }
6149     if (pos < 0) {
6150         return rb_ary_new3(3, rb_str_new(0,0),rb_str_new(0,0), str);
6151     }
6152     if (regex) {
6153         sep = rb_reg_nth_match(0, rb_backref_get());
6154     }
6155     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
6156                           sep,
6157                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
6158 }
6159
6160 /*
6161  *  call-seq:
6162  *     str.start_with?([prefix]+)   => true or false
6163  *
6164  *  Returns true if <i>str</i> starts with the prefix given.
6165  */
6166
6167 static VALUE
6168 rb_str_start_with(int argc, VALUE *argv, VALUE str)
6169 {
6170     int i;
6171
6172     for (i=0; i<argc; i++) {
6173         VALUE tmp = rb_check_string_type(argv[i]);
6174         if (NIL_P(tmp)) continue;
6175         rb_enc_check(str, tmp);
6176         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6177         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6178             return Qtrue;
6179     }
6180     return Qfalse;
6181 }
6182
/*
 *  call-seq:
 *     str.end_with?([suffix]+)   => true or false
 *
 *  Returns true if <i>str</i> ends with one of the suffixes given.
 */
6189
6190 static VALUE
6191 rb_str_end_with(int argc, VALUE *argv, VALUE str)
6192 {
6193     int i;
6194     char *p, *s;
6195     rb_encoding *enc;
6196
6197     for (i=0; i<argc; i++) {
6198         VALUE tmp = rb_check_string_type(argv[i]);
6199         if (NIL_P(tmp)) continue;
6200         enc = rb_enc_check(str, tmp);
6201         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6202         p = RSTRING_PTR(str);
6203         s = p + RSTRING_LEN(str) - RSTRING_LEN(tmp);
6204         if (rb_enc_left_char_head(p, s, enc) != s)
6205             continue;
6206         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6207             return Qtrue;
6208     }
6209     return Qfalse;
6210 }
6211
6212 void
6213 rb_str_setter(VALUE val, ID id, VALUE *var)
6214 {
6215     if (!NIL_P(val) && TYPE(val) != T_STRING) {
6216         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
6217     }
6218     *var = val;
6219 }
6220
6221
6222 /*
6223  *  call-seq:
6224  *     str.force_encoding(encoding)   => str
6225  *
6226  *  Changes the encoding to +encoding+ and returns self.
6227  */
6228
6229 static VALUE
6230 rb_str_force_encoding(VALUE str, VALUE enc)
6231 {
6232     str_modifiable(str);
6233     rb_enc_associate(str, rb_to_encoding(enc));
6234     return str;
6235 }
6236
/*
 *  call-seq:
 *     str.valid_encoding?  => true or false
 *
 *  Returns true for a string which is encoded correctly.
 *
 *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
 *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
 *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
 */
6247
6248 static VALUE
6249 rb_str_valid_encoding_p(VALUE str)
6250 {
6251     int cr = rb_enc_str_coderange(str);
6252
6253     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
6254 }
6255
6256 /*
6257  *  call-seq:
6258  *     str.ascii_only?  => true or false
6259  *
6260  *  Returns true for a string which has only ASCII characters.
6261  *
6262  *    "abc".force_encoding("UTF-8").ascii_only? => true
6263  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only? => false
6264  */
6265
6266 static VALUE
6267 rb_str_is_ascii_only_p(VALUE str)
6268 {
6269     int cr = rb_enc_str_coderange(str);
6270
6271     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
6272 }
6273
/**********************************************************************
 * Document-class: Symbol
 *
 *  <code>Symbol</code> objects represent names and some strings
 *  inside the Ruby interpreter. They are generated using the
 *  <code>:name</code> and <code>:"string"</code> literal syntax,
 *  and by the various <code>to_sym</code> methods. The same
 *  <code>Symbol</code> object will be created for a given name or string
 *  for the duration of a program's execution, regardless of the context
 *  or meaning of that name. Thus if <code>Fred</code> is a constant in
 *  one context, a method in another, and a class in a third, the
 *  <code>Symbol</code> <code>:Fred</code> will be the same object in
 *  all three contexts.
 *
 *     module One
 *       class Fred
 *       end
 *       $f1 = :Fred
 *     end
 *     module Two
 *       Fred = 1
 *       $f2 = :Fred
 *     end
 *     def Fred()
 *     end
 *     $f3 = :Fred
 *     $f1.object_id   #=> 2514190
 *     $f2.object_id   #=> 2514190
 *     $f3.object_id   #=> 2514190
 *
 */
6306
6307
/*
 *  call-seq:
 *     sym == obj   => true or false
 *
 *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
 *  symbol, returns <code>true</code>. Otherwise, returns
 *  <code>false</code>.
 */
6316
6317 static VALUE
6318 sym_equal(VALUE sym1, VALUE sym2)
6319 {
6320     if (sym1 == sym2) return Qtrue;
6321     return Qfalse;
6322 }
6323
6324
6325 /*
6326  *  call-seq:
6327  *     sym.inspect    => string
6328  *
6329  *  Returns the representation of <i>sym</i> as a symbol literal.
6330  *
6331  *     :fred.inspect   #=> ":fred"
6332  */
6333
6334 static VALUE
6335 sym_inspect(VALUE sym)
6336 {
6337     VALUE str;
6338     ID id = SYM2ID(sym);
6339     rb_encoding *enc;
6340
6341     sym = rb_id2str(id);
6342     enc = STR_ENC_GET(sym);
6343     str = rb_enc_str_new(0, RSTRING_LEN(sym)+1, enc);
6344     RSTRING_PTR(str)[0] = ':';
6345     memcpy(RSTRING_PTR(str)+1, RSTRING_PTR(sym), RSTRING_LEN(sym));
6346     if (RSTRING_LEN(sym) != strlen(RSTRING_PTR(sym)) ||
6347         !rb_enc_symname_p(RSTRING_PTR(sym), enc)) {
6348         str = rb_str_inspect(str);
6349         strncpy(RSTRING_PTR(str), ":\"", 2);
6350     }
6351     return str;
6352 }
6353
6354
6355 /*
6356  *  call-seq:
6357  *     sym.id2name   => string
6358  *     sym.to_s      => string
6359  *
6360  *  Returns the name or string corresponding to <i>sym</i>.
6361  *
6362  *     :fred.id2name   #=> "fred"
6363  */
6364
6365
6366 VALUE
6367 rb_sym_to_s(VALUE sym)
6368 {
6369     ID id = SYM2ID(sym);
6370
6371     return str_new3(rb_cString, rb_id2str(id));
6372 }
6373
6374
6375 /*
6376  * call-seq:
6377  *   sym.to_sym   => sym
6378  *   sym.intern   => sym
6379  *
6380  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
6381  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
6382  * in this case.
6383  */
6384
6385 static VALUE
6386 sym_to_sym(VALUE sym)
6387 {
6388     return sym;
6389 }
6390
6391 static VALUE
6392 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
6393 {
6394     VALUE obj;
6395
6396     if (argc < 1) {
6397         rb_raise(rb_eArgError, "no receiver given");
6398     }
6399     obj = argv[0];
6400     return rb_funcall3(obj, (ID)sym, argc - 1, argv + 1);
6401 }
6402
/*
 * call-seq:
 *   sym.to_proc
 *
 * Returns a _Proc_ object which responds to the given method by _sym_.
 *
 *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
 */
6411
6412 static VALUE
6413 sym_to_proc(VALUE sym)
6414 {
6415     return rb_proc_new(sym_call, (VALUE)SYM2ID(sym));
6416 }
6417
6418
6419 static VALUE
6420 sym_succ(VALUE sym)
6421 {
6422     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
6423 }
6424
6425 static VALUE
6426 sym_cmp(VALUE sym, VALUE other)
6427 {
6428     if (!SYMBOL_P(other)) {
6429         return Qnil;
6430     }
6431     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
6432 }
6433
6434 static VALUE
6435 sym_casecmp(VALUE sym, VALUE other)
6436 {
6437     if (!SYMBOL_P(other)) {
6438         return Qnil;
6439     }
6440     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
6441 }
6442
6443 static VALUE
6444 sym_match(VALUE sym, VALUE other)
6445 {
6446     return rb_str_match(rb_sym_to_s(sym), other);
6447 }
6448
6449 static VALUE
6450 sym_eqq(VALUE sym, VALUE other)
6451 {
6452     if (sym == other) return Qtrue;
6453     return rb_str_equal(rb_sym_to_s(sym), other);
6454 }
6455
6456 static VALUE
6457 sym_aref(int argc, VALUE *argv, VALUE sym)
6458 {
6459     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
6460 }
6461
6462 static VALUE
6463 sym_length(VALUE sym)
6464 {
6465     return rb_str_length(rb_id2str(SYM2ID(sym)));
6466 }
6467
6468 static VALUE
6469 sym_empty(VALUE sym)
6470 {
6471     return rb_str_empty(rb_id2str(SYM2ID(sym)));
6472 }
6473
6474 static VALUE
6475 sym_upcase(VALUE sym)
6476 {
6477     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
6478 }
6479
6480 static VALUE
6481 sym_downcase(VALUE sym)
6482 {
6483     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
6484 }
6485
6486 static VALUE
6487 sym_capitalize(VALUE sym)
6488 {
6489     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
6490 }
6491
6492 static VALUE
6493 sym_swapcase(VALUE sym)
6494 {
6495     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
6496 }
6497
6498 static VALUE
6499 sym_encoding(VALUE sym)
6500 {
6501     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
6502 }
6503
6504 ID
6505 rb_to_id(VALUE name)
6506 {
6507     VALUE tmp;
6508     ID id;
6509
6510     switch (TYPE(name)) {
6511       default:
6512         tmp = rb_check_string_type(name);
6513         if (NIL_P(tmp)) {
6514             rb_raise(rb_eTypeError, "%s is not a symbol",
6515                      RSTRING_PTR(rb_inspect(name)));
6516         }
6517         name = tmp;
6518         /* fall through */
6519       case T_STRING:
6520         name = rb_str_intern(name);
6521         /* fall through */
6522       case T_SYMBOL:
6523         return SYM2ID(name);
6524     }
6525     return id;
6526 }
6527
6528 /*
6529  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
6530  *  bytes, typically representing characters. String objects may be created
6531  *  using <code>String::new</code> or as literals.
6532  *
6533  *  Because of aliasing issues, users of strings should be aware of the methods
6534  *  that modify the contents of a <code>String</code> object.  Typically,
6535  *  methods with names ending in ``!'' modify their receiver, while those
6536  *  without a ``!'' return a new <code>String</code>.  However, there are
6537  *  exceptions, such as <code>String#[]=</code>.
6538  *
6539  */
6540
void
Init_String(void)
{
    /*
     * Creates the String and Symbol classes, mixes Comparable into both and
     * registers the C functions implementing their methods.  The last
     * argument of each rb_define_method() call is the method's arity; the
     * functions registered with -1 are the ones taking (argc, argv, self).
     */
    /* NOTE(review): presumably removes a macro wrapper around rb_intern so
     * the call below uses the plain function -- confirm in ruby.h. */
#undef rb_intern

    rb_cString  = rb_define_class("String", rb_cObject);
    rb_include_module(rb_cString, rb_mComparable);
    rb_define_alloc_func(rb_cString, str_alloc);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    /* construction, comparison, operators and basic queries */
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "==", rb_str_equal, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "+", rb_str_plus, 1);
    rb_define_method(rb_cString, "*", rb_str_times, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    /* "length" and "size" share one implementation */
    rb_define_method(rb_cString, "length", rb_str_length, 0);
    rb_define_method(rb_cString, "size", rb_str_length, 0);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    /* "next"/"next!" are aliases of "succ"/"succ!" */
    rb_define_method(rb_cString, "succ", rb_str_succ, 0);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next", rb_str_succ, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "replace", rb_str_replace, 1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);

    /* conversions and printable representations */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);

    /* case conversion returning a new string */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);

    /* the "!" counterparts of the case conversion methods */
    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);

    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    /* "lines"/"bytes"/"chars" use the same functions as the each_* methods
     * registered further down */
    rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
    rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat, 1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    rb_define_method(rb_cString, "intern", rb_str_intern, 0);
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    /* substitution and trimming methods */
    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);

    /* the "!" counterparts of the substitution and trimming methods */
    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);

    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* "slice" shares its implementation with "[]" */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    /* cache the ID of "to_s" in id_to_s */
    id_to_s = rb_intern("to_s");

    /* $; and $-F are two names bound to the same variable, rb_fs,
     * which starts out as nil */
    rb_fs = Qnil;
    rb_define_variable("$;", &rb_fs);
    rb_define_variable("$-F", &rb_fs);

    /* Symbol: the allocator and the singleton "new" are both undefined */
    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    rb_include_module(rb_cSymbol, rb_mComparable);
    rb_undef_alloc_func(rb_cSymbol);
    rb_undef_method(CLASS_OF(rb_cSymbol), "new");
    rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */

    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);
    rb_define_method(rb_cSymbol, "===", sym_eqq, 1);

    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    /* Symbol#match is registered with the same function as Symbol#=~ */
    rb_define_method(rb_cSymbol, "match", sym_match, 1);

    rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}