string.c

   1 /**********************************************************************
   2
   3   string.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 17:12:58 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
  10   Copyright (C) 2000  Information-technology Promotion Agency, Japan
  11
  12 **********************************************************************/
  13
  14 #include "ruby/ruby.h"
  15 #include "ruby/re.h"
  16 #include "ruby/encoding.h"
  17
/* Shorthand for the beg[no] / end[no] entries of a variable named `regs`,
 * which must be in scope wherever these macros are expanded. */
#define BEG(no) regs->beg[no]
#define END(no) regs->end[no]
  20
  21 #include <math.h>
  22 #include <ctype.h>
  23
  24 #ifdef HAVE_UNISTD_H
  25 #include <unistd.h>
  26 #endif
  27
/* Global VALUEs for the String and Symbol classes. */
VALUE rb_cString;
VALUE rb_cSymbol;

/* Flag bits kept in RBASIC(str)->flags for string objects. */
/* STR_TMPLOCK: while set, str_modifiable() raises "temporarily locked". */
#define STR_TMPLOCK FL_USER7
/* STR_NOEMBED: set when the string uses the as.heap representation
 * (STR_EMBED_P is its negation). */
#define STR_NOEMBED FL_USER1
#define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
/* STR_ASSOC: as.heap.aux.shared holds an associated object
 * (see rb_str_associate). */
#define STR_ASSOC   FL_USER3
/* Heap string whose buffer belongs to the string stored in aux.shared. */
#define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
/* Heap string whose aux.shared carries an association. */
#define STR_ASSOC_P(s)  FL_ALL(s, STR_NOEMBED|STR_ASSOC)
/* "No capacity": heap strings flagged shared or associated use the aux
 * union for aux.shared, so aux.capa is not available for them. */
#define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
#define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
/* Clear the shared/assoc bits, but only on a heap string. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
} while (0)
  42
  43
/* Switch str to the heap representation: set STR_NOEMBED and zero the
 * embedded-length bits in the flags word. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET(str, STR_NOEMBED);\
    STR_SET_EMBED_LEN(str, 0);\
} while (0)
/* Switch str to the embedded representation (only clears STR_NOEMBED; the
 * embedded length must be set separately). */
#define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
#define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
/* Store n as the embedded length, which lives in the flag bits selected by
 * RSTRING_EMBED_LEN_MASK.  n is evaluated once, into tmp_n. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)

/* Set the byte length in whichever place the current representation keeps
 * it: the flag bits (embedded) or as.heap.len (heap). */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN(str, n);\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)

/* Decrease the byte length by one, for either representation. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN(str, n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)
  75
  76 #define RESIZE_CAPA(str,capacity) do {\
  77     if (STR_EMBED_P(str)) {\
  78         if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
  79             char *tmp = ALLOC_N(char, capacity+1);\
  80             memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
  81             RSTRING(str)->as.heap.ptr = tmp;\
  82             RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
  83             STR_SET_NOEMBED(str);\
  84             RSTRING(str)->as.heap.aux.capa = (capacity);\
  85         }\
  86     }\
  87     else {\
  88         REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
  89         if (!STR_NOCAPA_P(str))\
  90             RSTRING(str)->as.heap.aux.capa = (capacity);\
  91     }\
  92 } while (0)
  93
/* Coderange predicates; rb_enc_str_coderange() scans the string first when
 * its coderange is still unknown. */
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* The rb_encoding for the encoding index recorded on str. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
  98
  99 static int
 100 single_byte_optimizable(VALUE str)
 101 {
 102     rb_encoding *enc = STR_ENC_GET(str);
 103
 104     if (rb_enc_mbmaxlen(enc) == 1)
 105         return 1;
 106
 107     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
 108     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
 109         return 1;
 110
 111     /* Conservative.  Possibly single byte.
 112      * "\xa1" in Shift_JIS for example. */
 113     return 0;
 114 }
 115
/* Global VALUE rb_fs; not referenced in this part of the file.
 * NOTE(review): presumably the default field separator -- confirm. */
VALUE rb_fs;
 117
/*
 * Return a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when there is none.
 *
 * When NONASCII_MASK is available (SIZEOF_VALUE is 4 or 8) and the range is
 * longer than two VALUE-sized words, the scan runs in three phases: single
 * bytes up to the first VALUE-aligned address, whole words tested against
 * NONASCII_MASK (the 0x80 bit of every byte), then the remaining bytes one
 * at a time.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080LL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if (sizeof(VALUE) * 2 < e - p) {
        const VALUE *s, *t;
        const VALUE lowbits = sizeof(VALUE) - 1;
        /* s: p rounded up to a multiple of sizeof(VALUE). */
        s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
        while (p < (const char *)s) {
            if (!ISASCII(*p))
                return p;
            p++;
        }
        /* t: e rounded down to a multiple of sizeof(VALUE). */
        t = (const VALUE*)(~lowbits & (VALUE)e);
        while (s < t) {
            if (*s & NONASCII_MASK) {
                /* Some byte of this word has its high bit set: end the word
                 * scan here and let the byte loop below locate it. */
                t = s;
                break;
            }
            s++;
        }
        p = (const char *)t;
    }
#endif
    /* Byte-wise scan of whatever the word scan did not cover. */
    while (p < e) {
        if (!ISASCII(*p))
            return p;
        p++;
    }
    return NULL;
}
 154
 155 static int
 156 coderange_scan(const char *p, long len, rb_encoding *enc)
 157 {
 158     const char *e = p + len;
 159
 160     if (rb_enc_to_index(enc) == 0) {
 161         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 162         p = search_nonascii(p, e);
 163         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 164     }
 165
 166     if (rb_enc_asciicompat(enc)) {
 167         p = search_nonascii(p, e);
 168         if (!p) {
 169             return ENC_CODERANGE_7BIT;
 170         }
 171         while (p < e) {
 172             int ret = rb_enc_precise_mbclen(p, e, enc);
 173             if (!MBCLEN_CHARFOUND_P(ret)) {
 174                 return ENC_CODERANGE_BROKEN;
 175             }
 176             p += MBCLEN_CHARFOUND_LEN(ret);
 177             if (p < e) {
 178                 p = search_nonascii(p, e);
 179                 if (!p) {
 180                     return ENC_CODERANGE_VALID;
 181                 }
 182             }
 183         }
 184         if (e < p) {
 185             return ENC_CODERANGE_BROKEN;
 186         }
 187         return ENC_CODERANGE_VALID;
 188     }
 189
 190     while (p < e) {
 191         int ret = rb_enc_precise_mbclen(p, e, enc);
 192
 193         if (!MBCLEN_CHARFOUND_P(ret)) {
 194             return ENC_CODERANGE_BROKEN;
 195         }
 196         p += MBCLEN_CHARFOUND_LEN(ret);
 197     }
 198     if (e < p) {
 199         return ENC_CODERANGE_BROKEN;
 200     }
 201     return ENC_CODERANGE_VALID;
 202 }
 203
 204 long
 205 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
 206 {
 207     const char *p = s;
 208
 209     if (*cr == ENC_CODERANGE_BROKEN)
 210         return e - s;
 211
 212     if (rb_enc_to_index(enc) == 0) {
 213         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 214         p = search_nonascii(p, e);
 215         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
 216         return e - s;
 217     }
 218     else if (rb_enc_asciicompat(enc)) {
 219         p = search_nonascii(p, e);
 220         if (!p) {
 221             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
 222             return e - s;
 223         }
 224         while (p < e) {
 225             int ret = rb_enc_precise_mbclen(p, e, enc);
 226             if (!MBCLEN_CHARFOUND_P(ret)) {
 227                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 228                 return p - s;
 229             }
 230             p += MBCLEN_CHARFOUND_LEN(ret);
 231             if (p < e) {
 232                 p = search_nonascii(p, e);
 233                 if (!p) {
 234                     *cr = ENC_CODERANGE_VALID;
 235                     return e - s;
 236                 }
 237             }
 238         }
 239         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
 240         return p - s;
 241     }
 242     else {
 243         while (p < e) {
 244             int ret = rb_enc_precise_mbclen(p, e, enc);
 245             if (!MBCLEN_CHARFOUND_P(ret)) {
 246                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 247                 return p - s;
 248             }
 249             p += MBCLEN_CHARFOUND_LEN(ret);
 250         }
 251         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
 252         return p - s;
 253     }
 254 }
 255
 256 static inline void
 257 str_enc_copy(VALUE str1, VALUE str2)
 258 {
 259     rb_enc_internal_set_index(str1, ENCODING_GET(str2));
 260 }
 261
 262 static void
 263 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
 264 {
 265     /* this function is designed for copying encoding and coderange
 266      * from src to new string "dest" which is made from the part of src.
 267      */
 268     str_enc_copy(dest, src);
 269     switch (ENC_CODERANGE(src)) {
 270       case ENC_CODERANGE_7BIT:
 271         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 272         break;
 273       case ENC_CODERANGE_VALID:
 274         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
 275             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
 276             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 277         else
 278             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 279         break;
 280       default:
 281         if (RSTRING_LEN(dest) == 0) {
 282             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
 283                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 284             else
 285                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 286         }
 287         break;
 288     }
 289 }
 290
 291 static void
 292 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
 293 {
 294     str_enc_copy(dest, src);
 295     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
 296 }
 297
 298 int
 299 rb_enc_str_coderange(VALUE str)
 300 {
 301     int cr = ENC_CODERANGE(str);
 302
 303     if (cr == ENC_CODERANGE_UNKNOWN) {
 304         rb_encoding *enc = STR_ENC_GET(str);
 305         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
 306         ENC_CODERANGE_SET(str, cr);
 307     }
 308     return cr;
 309 }
 310
 311 int
 312 rb_enc_str_asciionly_p(VALUE str)
 313 {
 314     rb_encoding *enc = STR_ENC_GET(str);
 315
 316     if (!rb_enc_asciicompat(enc))
 317         return Qfalse;
 318     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
 319         return Qtrue;
 320     return Qfalse;
 321 }
 322
 323 static inline void
 324 str_mod_check(VALUE s, const char *p, long len)
 325 {
 326     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
 327         rb_raise(rb_eRuntimeError, "string modified");
 328     }
 329 }
 330
 331 static inline void
 332 str_frozen_check(VALUE s)
 333 {
 334     if (OBJ_FROZEN(s)) {
 335         rb_raise(rb_eRuntimeError, "string frozen");
 336     }
 337 }
 338
 339 static VALUE
 340 str_alloc(VALUE klass)
 341 {
 342     NEWOBJ(str, struct RString);
 343     OBJSETUP(str, klass, T_STRING);
 344
 345     if (klass == rb_cSymbol) {
 346         /* need to be registered in table */
 347         RBASIC(str)->klass = rb_cString;
 348     }
 349     str->as.heap.ptr = 0;
 350     str->as.heap.len = 0;
 351     str->as.heap.aux.capa = 0;
 352
 353     return (VALUE)str;
 354 }
 355
 356 static VALUE
 357 str_new(VALUE klass, const char *ptr, long len)
 358 {
 359     VALUE str;
 360
 361     if (len < 0) {
 362         rb_raise(rb_eArgError, "negative string size (or size too big)");
 363     }
 364
 365     str = str_alloc(klass);
 366     if (len > RSTRING_EMBED_LEN_MAX) {
 367         RSTRING(str)->as.heap.aux.capa = len;
 368         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
 369         STR_SET_NOEMBED(str);
 370     }
 371     if (ptr) {
 372         memcpy(RSTRING_PTR(str), ptr, len);
 373     }
 374     STR_SET_LEN(str, len);
 375     RSTRING_PTR(str)[len] = '\0';
 376     return str;
 377 }
 378
 379 VALUE
 380 rb_str_new(const char *ptr, long len)
 381 {
 382     return str_new(rb_cString, ptr, len);
 383 }
 384
 385 VALUE
 386 rb_usascii_str_new(const char *ptr, long len)
 387 {
 388     VALUE str = str_new(rb_cString, ptr, len);
 389
 390     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 391     return str;
 392 }
 393
 394 VALUE
 395 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
 396 {
 397     VALUE str = str_new(rb_cString, ptr, len);
 398
 399     rb_enc_associate(str, enc);
 400     return str;
 401 }
 402
 403 VALUE
 404 rb_str_new2(const char *ptr)
 405 {
 406     if (!ptr) {
 407         rb_raise(rb_eArgError, "NULL pointer given");
 408     }
 409     return rb_str_new(ptr, strlen(ptr));
 410 }
 411
 412 VALUE
 413 rb_usascii_str_new2(const char *ptr)
 414 {
 415     if (!ptr) {
 416         rb_raise(rb_eArgError, "NULL pointer given");
 417     }
 418     return rb_usascii_str_new(ptr, strlen(ptr));
 419 }
 420
 421 VALUE
 422 rb_tainted_str_new(const char *ptr, long len)
 423 {
 424     VALUE str = rb_str_new(ptr, len);
 425
 426     OBJ_TAINT(str);
 427     return str;
 428 }
 429
 430 VALUE
 431 rb_tainted_str_new2(const char *ptr)
 432 {
 433     VALUE str = rb_str_new2(ptr);
 434
 435     OBJ_TAINT(str);
 436     return str;
 437 }
 438
/*
 * Make str2 hold the same content as str and return str2.
 *
 * Short content (at most RSTRING_EMBED_LEN_MAX bytes) is copied into str2's
 * embedded buffer.  Longer content is not copied: str2 points at str's
 * buffer, records str in aux.shared and is flagged ELTS_SHARED.
 * Encoding and coderange are copied from str in both cases.
 */
static VALUE
str_replace_shared(VALUE str2, VALUE str)
{
    if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
        /* Flip to the embedded representation first so that
         * RSTRING_PTR(str2) refers to the embedded buffer. */
        STR_SET_EMBED(str2);
        /* +1 also copies the byte that follows the content. */
        memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
        STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
    }
    else {
        FL_SET(str2, STR_NOEMBED);
        RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
        RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
        RSTRING(str2)->as.heap.aux.shared = str;
        FL_SET(str2, ELTS_SHARED);
    }
    rb_enc_cr_str_exact_copy(str2, str);

    return str2;
}
 458
 459 static VALUE
 460 str_new_shared(VALUE klass, VALUE str)
 461 {
 462     return str_replace_shared(str_alloc(klass), str);
 463 }
 464
 465 static VALUE
 466 str_new3(VALUE klass, VALUE str)
 467 {
 468     return str_new_shared(klass, str);
 469 }
 470
 471 VALUE
 472 rb_str_new3(VALUE str)
 473 {
 474     VALUE str2 = str_new3(rb_obj_class(str), str);
 475
 476     OBJ_INFECT(str2, str);
 477     return str2;
 478 }
 479
/*
 * Create a heap string of class klass that points at str's buffer without
 * copying it.
 *
 * If str is already shared, the new string joins the same aux.shared
 * target.  Otherwise str itself is turned into a shared string whose
 * aux.shared is the new string.  Encoding, coderange and taint are copied
 * from str.
 */
static VALUE
str_new4(VALUE klass, VALUE str)
{
    VALUE str2;

    str2 = str_alloc(klass);
    STR_SET_NOEMBED(str2);
    RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
    RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
    if (STR_SHARED_P(str)) {
        FL_SET(str2, ELTS_SHARED);
        RSTRING(str2)->as.heap.aux.shared = RSTRING(str)->as.heap.aux.shared;
    }
    else {
        /* This overwrites str's aux union (its capa, or its association). */
        FL_SET(str, ELTS_SHARED);
        RSTRING(str)->as.heap.aux.shared = str2;
    }
    rb_enc_cr_str_exact_copy(str2, str);
    OBJ_INFECT(str2, str);
    return str2;
}
 501
/*
 * Return a frozen string with the content of orig.
 *
 * orig itself is returned when it is already frozen.  Otherwise the result
 * is built according to orig's representation (shared, embedded,
 * associated or plain heap) and frozen before it is returned.
 */
VALUE
rb_str_new4(VALUE orig)
{
    VALUE klass, str;

    if (OBJ_FROZEN(orig)) return orig;
    klass = rb_obj_class(orig);
    if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
        long ofs;
        /* How much shorter orig is than the string it shares with. */
        ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
        if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
            (!OBJ_TAINTED(str) && OBJ_TAINTED(orig))) {
            /* The shared string cannot stand in for orig as is (different
             * length, class or taint): derive a new string from it and skip
             * its first ofs bytes.
             * NOTE(review): this assumes orig is the tail of the shared
             * buffer -- confirm against callers. */
            str = str_new3(klass, str);
            RSTRING(str)->as.heap.ptr += ofs;
            RSTRING(str)->as.heap.len -= ofs;
        }
        rb_enc_cr_str_exact_copy(str, orig);
        OBJ_INFECT(str, orig);
    }
    else if (STR_EMBED_P(orig)) {
        /* Embedded content is simply copied. */
        str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
        rb_enc_cr_str_exact_copy(str, orig);
        OBJ_INFECT(str, orig);
    }
    else if (STR_ASSOC_P(orig)) {
        /* str_new4 overwrites orig's aux.shared, so save the association
         * first and move it to the new string afterwards. */
        VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
        FL_UNSET(orig, STR_ASSOC);
        str = str_new4(klass, orig);
        FL_SET(str, STR_ASSOC);
        RSTRING(str)->as.heap.aux.shared = assoc;
    }
    else {
        str = str_new4(klass, orig);
    }
    OBJ_FREEZE(str);
    return str;
}
 539
 540 VALUE
 541 rb_str_new5(VALUE obj, const char *ptr, long len)
 542 {
 543     return str_new(rb_obj_class(obj), ptr, len);
 544 }
 545
 546 #define STR_BUF_MIN_SIZE 128
 547
 548 VALUE
 549 rb_str_buf_new(long capa)
 550 {
 551     VALUE str = str_alloc(rb_cString);
 552
 553     if (capa < STR_BUF_MIN_SIZE) {
 554         capa = STR_BUF_MIN_SIZE;
 555     }
 556     FL_SET(str, STR_NOEMBED);
 557     RSTRING(str)->as.heap.aux.capa = capa;
 558     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
 559     RSTRING(str)->as.heap.ptr[0] = '\0';
 560
 561     return str;
 562 }
 563
 564 VALUE
 565 rb_str_buf_new2(const char *ptr)
 566 {
 567     VALUE str;
 568     long len = strlen(ptr);
 569
 570     str = rb_str_buf_new(len);
 571     rb_str_buf_cat(str, ptr, len);
 572
 573     return str;
 574 }
 575
 576 VALUE
 577 rb_str_tmp_new(long len)
 578 {
 579     return str_new(0, 0, len);
 580 }
 581
 582 void
 583 rb_str_free(VALUE str)
 584 {
 585     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
 586         xfree(RSTRING(str)->as.heap.ptr);
 587     }
 588 }
 589
 590 VALUE
 591 rb_str_to_str(VALUE str)
 592 {
 593     return rb_convert_type(str, T_STRING, "String", "to_str");
 594 }
 595
 596 void
 597 rb_str_shared_replace(VALUE str, VALUE str2)
 598 {
 599     rb_encoding *enc;
 600     int cr;
 601     if (str == str2) return;
 602     enc = STR_ENC_GET(str2);
 603     cr = ENC_CODERANGE(str2);
 604     rb_str_modify(str);
 605     if (OBJ_TAINTED(str2)) OBJ_TAINT(str);
 606     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
 607         STR_SET_EMBED(str);
 608         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
 609         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
 610         rb_enc_associate(str, enc);
 611         ENC_CODERANGE_SET(str, cr);
 612         return;
 613     }
 614     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
 615         free(RSTRING_PTR(str));
 616     }
 617     STR_SET_NOEMBED(str);
 618     STR_UNSET_NOCAPA(str);
 619     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
 620     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
 621     if (STR_NOCAPA_P(str2)) {
 622         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
 623         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
 624     }
 625     else {
 626         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
 627     }
 628     RSTRING(str2)->as.heap.ptr = 0;     /* abandon str2 */
 629     RSTRING(str2)->as.heap.len = 0;
 630     RSTRING(str2)->as.heap.aux.capa = 0;
 631     STR_UNSET_NOCAPA(str2);
 632     rb_enc_associate(str, enc);
 633     ENC_CODERANGE_SET(str, cr);
 634 }
 635
 636 static ID id_to_s;
 637
 638 VALUE
 639 rb_obj_as_string(VALUE obj)
 640 {
 641     VALUE str;
 642
 643     if (TYPE(obj) == T_STRING) {
 644         return obj;
 645     }
 646     str = rb_funcall(obj, id_to_s, 0);
 647     if (TYPE(str) != T_STRING)
 648         return rb_any_to_s(obj);
 649     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
 650     return str;
 651 }
 652
 653 static VALUE rb_str_replace(VALUE, VALUE);
 654
 655 VALUE
 656 rb_str_dup(VALUE str)
 657 {
 658     VALUE dup = str_alloc(rb_obj_class(str));
 659     rb_str_replace(dup, str);
 660     return dup;
 661 }
 662
 663
 664 /*
 665  *  call-seq:
 666  *     String.new(str="")   => new_str
 667  *
 668  *  Returns a new string object containing a copy of <i>str</i>.
 669  */
 670
 671 static VALUE
 672 rb_str_init(int argc, VALUE *argv, VALUE str)
 673 {
 674     VALUE orig;
 675
 676     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
 677         rb_str_replace(str, orig);
 678     return str;
 679 }
 680
 681 long
 682 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
 683 {
 684     long c;
 685     const char *q;
 686
 687     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 688         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
 689     }
 690     else if (rb_enc_asciicompat(enc)) {
 691         c = 0;
 692         while (p < e) {
 693             if (ISASCII(*p)) {
 694                 q = search_nonascii(p, e);
 695                 if (!q)
 696                     return c + (e - p);
 697                 c += q - p;
 698                 p = q;
 699             }
 700             p += rb_enc_mbclen(p, e, enc);
 701             c++;
 702         }
 703         return c;
 704     }
 705
 706     for (c=0; p<e; c++) {
 707         p += rb_enc_mbclen(p, e, enc);
 708     }
 709     return c;
 710 }
 711
 712 long
 713 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
 714 {
 715     long c;
 716     const char *q;
 717     int ret;
 718
 719     *cr = 0;
 720     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 721         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
 722     }
 723     else if (rb_enc_asciicompat(enc)) {
 724         c = 0;
 725         while (p < e) {
 726             if (ISASCII(*p)) {
 727                 q = search_nonascii(p, e);
 728                 if (!q) {
 729                     return c + (e - p);
 730                 }
 731                 c += q - p;
 732                 p = q;
 733             }
 734             ret = rb_enc_precise_mbclen(p, e, enc);
 735             if (MBCLEN_CHARFOUND_P(ret)) {
 736                 *cr |= ENC_CODERANGE_VALID;
 737                 p += MBCLEN_CHARFOUND_LEN(ret);
 738             }
 739             else {
 740                 *cr = ENC_CODERANGE_BROKEN;
 741                 p++;
 742             }
 743             c++;
 744         }
 745         if (!*cr) *cr = ENC_CODERANGE_7BIT;
 746         return c;
 747     }
 748
 749     for (c=0; p<e; c++) {
 750         ret = rb_enc_precise_mbclen(p, e, enc);
 751         if (MBCLEN_CHARFOUND_P(ret)) {
 752             *cr |= ENC_CODERANGE_VALID;
 753             p += MBCLEN_CHARFOUND_LEN(ret);
 754         }
 755         else {
 756             *cr = ENC_CODERANGE_BROKEN;
 757             p++;
 758         }
 759     }
 760     if (!*cr) *cr = ENC_CODERANGE_7BIT;
 761     return c;
 762 }
 763
#ifdef NONASCII_MASK
/* A byte is a "lead byte" unless its top two bits are 10, the pattern of a
 * UTF-8 continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
/*
 * Count the bytes of the VALUE-sized word *s that satisfy
 * is_utf8_lead_byte, handling all bytes of the word at once.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE d = *s;
    /* Per byte: bit 6 becomes (bit6 | ~bit7), which is 1 exactly when the
     * byte is not of the form 10xxxxxx. */
    d |= ~(d>>1);
    /* Move that bit to bit 0 of each byte and discard everything else. */
    d >>= 6;
    d &= NONASCII_MASK >> 7;
    /* Sum the per-byte 0/1 values into the lowest byte. */
    d += (d>>8);
    d += (d>>16);
#if SIZEOF_VALUE == 8
    d += (d>>32);
#endif
    /* The count is at most sizeof(VALUE), so four bits are enough. */
    return (d&0xF);
}
#endif
 781
 782 static long
 783 str_strlen(VALUE str, rb_encoding *enc)
 784 {
 785     const char *p, *e;
 786     int n, cr;
 787
 788     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
 789     if (!enc) enc = STR_ENC_GET(str);
 790     p = RSTRING_PTR(str);
 791     e = RSTRING_END(str);
 792 #ifdef NONASCII_MASK
 793     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
 794         enc == rb_utf8_encoding()) {
 795         VALUE len = 0;
 796         if (sizeof(VALUE) * 2 < e - p) {
 797             const VALUE *s, *t;
 798             const VALUE lowbits = sizeof(VALUE) - 1;
 799             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
 800             t = (const VALUE*)(~lowbits & (VALUE)e);
 801             while (p < (const char *)s) {
 802                 if (is_utf8_lead_byte(*p)) len++;
 803                 p++;
 804             }
 805             while (s < t) {
 806                 len += count_utf8_lead_bytes_with_word(s);
 807                 s++;
 808             }
 809             p = (const char *)s;
 810         }
 811         while (p < e) {
 812             if (is_utf8_lead_byte(*p)) len++;
 813             p++;
 814         }
 815         return (long)len;
 816     }
 817 #endif
 818     n = rb_enc_strlen_cr(p, e, enc, &cr);
 819     if (cr) {
 820         ENC_CODERANGE_SET(str, cr);
 821     }
 822     return n;
 823 }
 824
 825 /*
 826  *  call-seq:
 827  *     str.length   => integer
 828  *     str.size     => integer
 829  *
 830  *  Returns the character length of <i>str</i>.
 831  */
 832
 833 VALUE
 834 rb_str_length(VALUE str)
 835 {
 836     int len;
 837
 838     len = str_strlen(str, STR_ENC_GET(str));
 839     return INT2NUM(len);
 840 }
 841
 842 /*
 843  *  call-seq:
 844  *     str.bytesize  => integer
 845  *
 846  *  Returns the length of <i>str</i> in bytes.
 847  */
 848
 849 static VALUE
 850 rb_str_bytesize(VALUE str)
 851 {
 852     return INT2NUM(RSTRING_LEN(str));
 853 }
 854
 855 /*
 856  *  call-seq:
 857  *     str.empty?   => true or false
 858  *
 859  *  Returns <code>true</code> if <i>str</i> has a length of zero.
 860  *
 861  *     "hello".empty?   #=> false
 862  *     "".empty?        #=> true
 863  */
 864
 865 static VALUE
 866 rb_str_empty(VALUE str)
 867 {
 868     if (RSTRING_LEN(str) == 0)
 869         return Qtrue;
 870     return Qfalse;
 871 }
 872
 873 /*
 874  *  call-seq:
 875  *     str + other_str   => new_str
 876  *
 877  *  Concatenation---Returns a new <code>String</code> containing
 878  *  <i>other_str</i> concatenated to <i>str</i>.
 879  *
 880  *     "Hello from " + self.to_s   #=> "Hello from main"
 881  */
 882
 883 VALUE
 884 rb_str_plus(VALUE str1, VALUE str2)
 885 {
 886     VALUE str3;
 887     rb_encoding *enc;
 888
 889     StringValue(str2);
 890     enc = rb_enc_check(str1, str2);
 891     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
 892     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
 893     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
 894            RSTRING_PTR(str2), RSTRING_LEN(str2));
 895     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
 896
 897     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
 898         OBJ_TAINT(str3);
 899     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
 900                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
 901     return str3;
 902 }
 903
 904 /*
 905  *  call-seq:
 906  *     str * integer   => new_str
 907  *
 908  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
 909  *  the receiver.
 910  *
 911  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
 912  */
 913
 914 VALUE
 915 rb_str_times(VALUE str, VALUE times)
 916 {
 917     VALUE str2;
 918     long n, len;
 919
 920     len = NUM2LONG(times);
 921     if (len < 0) {
 922         rb_raise(rb_eArgError, "negative argument");
 923     }
 924     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
 925         rb_raise(rb_eArgError, "argument too big");
 926     }
 927
 928     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
 929     if (len) {
 930         n = RSTRING_LEN(str);
 931         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), n);
 932         while (n <= len/2) {
 933             memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), n);
 934             n *= 2;
 935         }
 936         memcpy(RSTRING_PTR(str2) + n, RSTRING_PTR(str2), len-n);
 937     }
 938     RSTRING_PTR(str2)[RSTRING_LEN(str2)] = '\0';
 939     OBJ_INFECT(str2, str);
 940     rb_enc_cr_str_copy_for_substr(str2, str);
 941
 942     return str2;
 943 }
 944
 945 /*
 946  *  call-seq:
 947  *     str % arg   => new_str
 948  *
 949  *  Format---Uses <i>str</i> as a format specification, and returns the result
 950  *  of applying it to <i>arg</i>. If the format specification contains more than
 951  *  one substitution, then <i>arg</i> must be an <code>Array</code> containing
 952  *  the values to be substituted. See <code>Kernel::sprintf</code> for details
 953  *  of the format string.
 954  *
 955  *     "%05d" % 123                              #=> "00123"
 956  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
 957  */
 958
 959 static VALUE
 960 rb_str_format_m(VALUE str, VALUE arg)
 961 {
 962     VALUE tmp = rb_check_array_type(arg);
 963
 964     if (!NIL_P(tmp)) {
 965         return rb_str_format(RARRAY_LEN(tmp), RARRAY_PTR(tmp), str);
 966     }
 967     return rb_str_format(1, &arg, str);
 968 }
 969
 970 static inline void
 971 str_modifiable(VALUE str)
 972 {
 973     if (FL_TEST(str, STR_TMPLOCK)) {
 974         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
 975     }
 976     if (OBJ_FROZEN(str)) rb_error_frozen("string");
 977     if (!OBJ_TAINTED(str) && rb_safe_level() >= 4)
 978         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
 979 }
 980
 981 static inline int
 982 str_independent(VALUE str)
 983 {
 984     str_modifiable(str);
 985     if (!STR_SHARED_P(str)) return 1;
 986     if (STR_EMBED_P(str)) return 1;
 987     return 0;
 988 }
 989
/*
 * Give str a private heap buffer holding a copy of its current content,
 * with capacity equal to its length, and clear its shared/assoc flags.
 * The previous buffer is not freed here.
 */
static void
str_make_independent(VALUE str)
{
    char *ptr;
    long len = RSTRING_LEN(str);

    ptr = ALLOC_N(char, len+1);
    /* Copy before changing the representation: RSTRING_PTR(str) must still
     * refer to the old content. */
    if (RSTRING_PTR(str)) {
        memcpy(ptr, RSTRING_PTR(str), len);
    }
    STR_SET_NOEMBED(str);
    ptr[len] = 0;
    RSTRING(str)->as.heap.ptr = ptr;
    RSTRING(str)->as.heap.len = len;
    RSTRING(str)->as.heap.aux.capa = len;
    STR_UNSET_NOCAPA(str);
}
1007
1008 void
1009 rb_str_modify(VALUE str)
1010 {
1011     if (!str_independent(str))
1012         str_make_independent(str);
1013     ENC_CODERANGE_CLEAR(str);
1014 }
1015
/*
 * Attach `add` to str as its associated object, stored in aux.shared under
 * the STR_ASSOC flag.
 *
 * If str already has an association, `add` is concatenated onto it with
 * rb_ary_concat.  Otherwise str is first brought into a heap form whose
 * aux field is free for the association.  Raises (rb_error_frozen) when
 * str is frozen.
 */
void
rb_str_associate(VALUE str, VALUE add)
{
    /* sanity check */
    if (OBJ_FROZEN(str)) rb_error_frozen("string");
    if (STR_ASSOC_P(str)) {
        /* already associated */
        rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
    }
    else {
        if (STR_SHARED_P(str)) {
            /* Remember the shared target before str gets its own buffer. */
            VALUE assoc = RSTRING(str)->as.heap.aux.shared;
            str_make_independent(str);
            if (STR_ASSOC_P(assoc)) {
                /* The shared target carries an association: extend that
                 * one and adopt it for str. */
                assoc = RSTRING(assoc)->as.heap.aux.shared;
                rb_ary_concat(assoc, add);
                add = assoc;
            }
        }
        else if (STR_EMBED_P(str)) {
            str_make_independent(str);
        }
        else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
            /* aux.capa is about to be overwritten by the association, so
             * shrink or grow the buffer to exactly the length first. */
            RESIZE_CAPA(str, RSTRING_LEN(str));
        }
        FL_SET(str, STR_ASSOC);
        /* NOTE(review): presumably hides the array from Ruby code --
         * confirm. */
        RBASIC(add)->klass = 0;
        RSTRING(str)->as.heap.aux.shared = add;
    }
}
1046
1047 VALUE
1048 rb_str_associated(VALUE str)
1049 {
1050     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
1051     if (STR_ASSOC_P(str)) {
1052         return RSTRING(str)->as.heap.aux.shared;
1053     }
1054     return Qfalse;
1055 }
1056
1057 VALUE
1058 rb_string_value(volatile VALUE *ptr)
1059 {
1060     VALUE s = *ptr;
1061     if (TYPE(s) != T_STRING) {
1062         s = rb_str_to_str(s);
1063         *ptr = s;
1064     }
1065     return s;
1066 }
1067
1068 char *
1069 rb_string_value_ptr(volatile VALUE *ptr)
1070 {
1071     return RSTRING_PTR(rb_string_value(ptr));
1072 }
1073
1074 char *
1075 rb_string_value_cstr(volatile VALUE *ptr)
1076 {
1077     VALUE str = rb_string_value(ptr);
1078     char *s = RSTRING_PTR(str);
1079
1080     if (!s || RSTRING_LEN(str) != strlen(s)) {
1081         rb_raise(rb_eArgError, "string contains null byte");
1082     }
1083     return s;
1084 }
1085
1086 VALUE
1087 rb_check_string_type(VALUE str)
1088 {
1089     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
1090     return str;
1091 }
1092
1093 /*
1094  *  call-seq:
1095  *     String.try_convert(obj) -> string or nil
1096  *
1097  *  Try to convert <i>obj</i> into a String, using to_str method.
 *  Returns the converted string, or nil if <i>obj</i> cannot be converted
1099  *  for any reason.
1100  *
1101  *     String.try_convert("str")     # => str
1102  *     String.try_convert(/re/)      # => nil
1103  */
1104 static VALUE
1105 rb_str_s_try_convert(VALUE dummy, VALUE str)
1106 {
1107     return rb_check_string_type(str);
1108 }
1109
/*
 * Returns a pointer to the start of the nth character (0-based) of the
 * byte range [p, e) interpreted in encoding enc.  The returned pointer is
 * never greater than e.
 */
char*
rb_enc_nth(const char *p, const char *e, int nth, rb_encoding *enc)
{
    if (rb_enc_mbmaxlen(enc) == 1) {
        /* at most one byte per character: index equals byte offset */
        p += nth;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* fixed-width encoding: every character has the same length */
        p += nth * rb_enc_mbmaxlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        const char *p2, *e2;
        int n;

        while (p < e && 0 < nth) {
            /* e2 is where the nth character would start if every
             * remaining character were a single byte */
            e2 = p + nth;
            if (e < e2)
                return (char *)e;
            if (ISASCII(*p)) {
                /* skip the whole ASCII run in one step */
                p2 = search_nonascii(p, e2);
                if (!p2)
                    return (char *)e2;
                nth -= p2 - p;
                p = p2;
            }
            /* step over one character using its encoded length */
            n = rb_enc_mbclen(p, e, enc);
            p += n;
            nth--;
        }
        /* ran out of bytes before reaching the nth character */
        if (nth != 0)
            return (char *)e;
        return (char *)p;
    }
    else {
        /* generic variable-width encoding: advance one character at a time */
        while (p<e && nth--) {
            p += rb_enc_mbclen(p, e, enc);
        }
    }
    /* clamp results that ran past the end of the range */
    if (p > e) p = e;
    return (char*)p;
}
1150
1151 static char*
1152 str_nth(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1153 {
1154     if (singlebyte)
1155         p += nth;
1156     else {
1157         p = rb_enc_nth(p, e, nth, enc);
1158     }
1159     if (!p) return 0;
1160     if (p > e) p = e;
1161     return (char *)p;
1162 }
1163
1164 /* char offset to byte offset */
1165 static int
1166 str_offset(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
1167 {
1168     const char *pp = str_nth(p, e, nth, enc, singlebyte);
1169     if (!pp) return e - p;
1170     return pp - p;
1171 }
1172
1173 #ifdef NONASCII_MASK
1174 static char *
1175 str_utf8_nth(const char *p, const char *e, int nth)
1176 {
1177     if (sizeof(VALUE) * 2 < nth) {
1178         const VALUE *s, *t;
1179         const VALUE lowbits = sizeof(VALUE) - 1;
1180         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
1181         t = (const VALUE*)(~lowbits & (VALUE)e);
1182         while (p < (const char *)s) {
1183             if (is_utf8_lead_byte(*p)) nth--;
1184             p++;
1185         }
1186         do {
1187             nth -= count_utf8_lead_bytes_with_word(s);
1188             s++;
1189         } while (s < t && sizeof(VALUE) <= nth);
1190         p = (char *)s;
1191     }
1192     while (p < e) {
1193         if (is_utf8_lead_byte(*p)) {
1194             if (nth == 0) break;
1195             nth--;
1196         }
1197         p++;
1198     }
1199     return (char *)p;
1200 }
1201
/*
 * Byte offset, relative to p, of the nth character of the UTF-8 range
 * [p, e).  A null result from str_utf8_nth() counts as the end of the range.
 */
static int
str_utf8_offset(const char *p, const char *e, int nth)
{
    const char *target = str_utf8_nth(p, e, nth);

    return (target ? target : e) - p;
}
1209 #endif
1210
1211 /* byte offset to char offset */
1212 long
1213 rb_str_sublen(VALUE str, long pos)
1214 {
1215     if (single_byte_optimizable(str) || pos < 0)
1216         return pos;
1217     else {
1218         char *p = RSTRING_PTR(str);
1219         return rb_enc_strlen(p, p + pos, STR_ENC_GET(str));
1220     }
1221 }
1222
1223 VALUE
1224 rb_str_subseq(VALUE str, long beg, long len)
1225 {
1226     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
1227
1228     rb_enc_cr_str_copy_for_substr(str2, str);
1229     OBJ_INFECT(str2, str);
1230
1231     return str2;
1232 }
1233
/*
 * Returns a new string made of len characters of str starting at character
 * index beg.  A negative beg counts from the end of str.  Returns Qnil when
 * len is negative or when beg lies outside the string.
 */
VALUE
rb_str_substr(VALUE str, long beg, long len)
{
    rb_encoding *enc = STR_ENC_GET(str);
    VALUE str2;
    char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
    int singlebyte = single_byte_optimizable(str);

    if (len < 0) return Qnil;
    if (!RSTRING_LEN(str)) {
        len = 0;
    }
    if (beg < 0) {
        /* cannot take more characters than lie between beg and the end */
        if (len > -beg) len = -beg;
        /* start is close to the end of a long string: walk backwards from
         * the end instead of counting characters from the front */
        if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
            beg = -beg;
            /* move e back to the end of the requested substring */
            while (beg-- > len && (e = rb_enc_prev_char(s, e, enc)) != 0);
            p = e;
            if (!p) return Qnil;
            /* move p back len characters to the substring start */
            while (len-- > 0 && (p = rb_enc_prev_char(s, p, enc)) != 0);
            if (!p) return Qnil;
            /* from here on len is a byte count */
            len = e - p;
            goto sub;
        }
        else {
            beg += str_strlen(str, enc);
            if (beg < 0) return Qnil;
        }
    }
    else if (beg > 0 && beg > str_strlen(str, enc)) {
        return Qnil;
    }
    /* the branches below locate p and turn len from characters into bytes */
    if (len == 0) {
        p = 0;
    }
#ifdef NONASCII_MASK
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
        enc == rb_utf8_encoding()) {
        p = str_utf8_nth(s, e, beg);
        len = str_utf8_offset(p, e, len);
    }
#endif
    else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) {
        len = 0;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* fixed-width encoding: multiply, clamped to the bytes that remain */
        if (len * rb_enc_mbmaxlen(enc) > e - p)
            len = e - p;
        else
            len *= rb_enc_mbmaxlen(enc);
    }
    else {
        len = str_offset(p, e, len, enc, singlebyte);
    }
  sub:
    /* NOTE(review): len is in bytes here while beg is still the character
     * index (or the leftover counter on the goto path) -- confirm intended */
    if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
        /* result too long to embed: build str2 through rb_str_new4/str_new3
         * and point it at the last len bytes instead of copying them */
        str2 = rb_str_new4(str);
        str2 = str_new3(rb_obj_class(str2), str2);
        RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
        RSTRING(str2)->as.heap.len = len;
    }
    else {
        str2 = rb_str_new5(str, p, len);
        rb_enc_cr_str_copy_for_substr(str2, str);
        OBJ_INFECT(str2, str);
    }

    return str2;
}
1303
1304 VALUE
1305 rb_str_freeze(VALUE str)
1306 {
1307     if (STR_ASSOC_P(str)) {
1308         VALUE ary = RSTRING(str)->as.heap.aux.shared;
1309         OBJ_FREEZE(ary);
1310     }
1311     return rb_obj_freeze(str);
1312 }
1313
1314 VALUE
1315 rb_str_dup_frozen(VALUE str)
1316 {
1317     if (STR_SHARED_P(str) && RSTRING(str)->as.heap.aux.shared) {
1318         VALUE shared = RSTRING(str)->as.heap.aux.shared;
1319         if (RSTRING_LEN(shared) == RSTRING_LEN(str)) {
1320             OBJ_FREEZE(shared);
1321             return shared;
1322         }
1323     }
1324     if (OBJ_FROZEN(str)) return str;
1325     str = rb_str_dup(str);
1326     OBJ_FREEZE(str);
1327     return str;
1328 }
1329
1330 VALUE
1331 rb_str_locktmp(VALUE str)
1332 {
1333     if (FL_TEST(str, STR_TMPLOCK)) {
1334         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
1335     }
1336     FL_SET(str, STR_TMPLOCK);
1337     return str;
1338 }
1339
1340 VALUE
1341 rb_str_unlocktmp(VALUE str)
1342 {
1343     if (!FL_TEST(str, STR_TMPLOCK)) {
1344         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
1345     }
1346     FL_UNSET(str, STR_TMPLOCK);
1347     return str;
1348 }
1349
1350 void
1351 rb_str_set_len(VALUE str, long len)
1352 {
1353     STR_SET_LEN(str, len);
1354     RSTRING_PTR(str)[len] = '\0';
1355 }
1356
1357 VALUE
1358 rb_str_resize(VALUE str, long len)
1359 {
1360     long slen;
1361
1362     if (len < 0) {
1363         rb_raise(rb_eArgError, "negative string size (or size too big)");
1364     }
1365
1366     rb_str_modify(str);
1367     slen = RSTRING_LEN(str);
1368     if (len != slen) {
1369         if (STR_EMBED_P(str)) {
1370             char *ptr;
1371             if (len <= RSTRING_EMBED_LEN_MAX) {
1372                 STR_SET_EMBED_LEN(str, len);
1373                 RSTRING(str)->as.ary[len] = '\0';
1374                 return str;
1375             }
1376             ptr = ALLOC_N(char,len+1);
1377             MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
1378             RSTRING(str)->as.heap.ptr = ptr;
1379             STR_SET_NOEMBED(str);
1380         }
1381         else if (len <= RSTRING_EMBED_LEN_MAX) {
1382             char *ptr = RSTRING(str)->as.heap.ptr;
1383             STR_SET_EMBED(str);
1384             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, len);
1385             RSTRING(str)->as.ary[len] = '\0';
1386             STR_SET_EMBED_LEN(str, len);
1387             xfree(ptr);
1388             return str;
1389         }
1390         else if (slen < len || slen - len > 1024) {
1391             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1392         }
1393         if (!STR_NOCAPA_P(str)) {
1394             RSTRING(str)->as.heap.aux.capa = len;
1395         }
1396         RSTRING(str)->as.heap.len = len;
1397         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
1398     }
1399     return str;
1400 }
1401
1402 VALUE
1403 rb_str_buf_cat(VALUE str, const char *ptr, long len)
1404 {
1405     long capa, total;
1406
1407     if (len == 0) return str;
1408     if (len < 0) {
1409         rb_raise(rb_eArgError, "negative string size (or size too big)");
1410     }
1411     rb_str_modify(str);
1412     if (STR_ASSOC_P(str)) {
1413         FL_UNSET(str, STR_ASSOC);
1414         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
1415     }
1416     else if (STR_EMBED_P(str)) {
1417         capa = RSTRING_EMBED_LEN_MAX;
1418     }
1419     else {
1420         capa = RSTRING(str)->as.heap.aux.capa;
1421     }
1422     total = RSTRING_LEN(str)+len;
1423     if (capa <= total) {
1424         while (total > capa) {
1425             capa = (capa + 1) * 2;
1426         }
1427         RESIZE_CAPA(str, capa);
1428     }
1429     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
1430     STR_SET_LEN(str, total);
1431     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
1432
1433     return str;
1434 }
1435
1436 VALUE
1437 rb_str_buf_cat2(VALUE str, const char *ptr)
1438 {
1439     return rb_str_buf_cat(str, ptr, strlen(ptr));
1440 }
1441
1442 VALUE
1443 rb_str_cat(VALUE str, const char *ptr, long len)
1444 {
1445     if (len < 0) {
1446         rb_raise(rb_eArgError, "negative string size (or size too big)");
1447     }
1448     if (STR_ASSOC_P(str)) {
1449         rb_str_modify(str);
1450         if (STR_EMBED_P(str)) str_make_independent(str);
1451         REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len);
1452         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
1453         RSTRING(str)->as.heap.len += len;
1454         RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0'; /* sentinel */
1455         return str;
1456     }
1457
1458     return rb_str_buf_cat(str, ptr, len);
1459 }
1460
1461 VALUE
1462 rb_str_cat2(VALUE str, const char *ptr)
1463 {
1464     return rb_str_cat(str, ptr, strlen(ptr));
1465 }
1466
/*
 * Appends len bytes at ptr to str while keeping str's encoding index and
 * coderange up to date.
 *
 *   ptr_encindex  encoding index of the appended bytes
 *   ptr_cr        coderange of the appended bytes (may be
 *                 ENC_CODERANGE_UNKNOWN)
 *   ptr_cr_ret    when non-null, receives the coderange that was finally
 *                 used for the appended bytes
 *
 * Raises ArgumentError when the two encodings cannot be combined or when
 * len is negative.  Returns str.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    long capa, total, off = -1;

    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;
    int str_a8 = ENCODING_IS_ASCII8BIT(str);
    int ptr_a8 = ptr_encindex == 0;

    str_cr = ENC_CODERANGE(str);

    /* step 1: settle the coderange of the appended bytes */
    if (str_encindex == ptr_encindex) {
        if (str_cr == ENC_CODERANGE_UNKNOWN ||
            (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
            ptr_cr = ENC_CODERANGE_UNKNOWN;
        }
        else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
        rb_encoding *str_enc = rb_enc_from_index(str_encindex);
        rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* a non-ASCII-compatible side: only trivial cases are allowed */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                /* empty receiver simply takes over the appended encoding */
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                return str;
            }
            goto incompatible;
        }
        if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, ptr_enc);
        }
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;

    /* step 2: different encodings are only allowed when one side is 7bit */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
      incompatible:
        rb_raise(rb_eArgError, "append incompatible encoding strings: %s and %s",
            rb_enc_name(rb_enc_from_index(str_encindex)),
            rb_enc_name(rb_enc_from_index(ptr_encindex)));
    }

    /* step 3: choose the encoding index and coderange of the result */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            res_encindex = !str_a8 ? str_encindex : ptr_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
        res_cr = str_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }

    /* step 4: append the bytes */
    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    /* ptr may point into str itself; remember the offset because the
     * buffer address is re-read after rb_str_modify()/RESIZE_CAPA() */
    if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
        off = ptr - RSTRING_PTR(str);
    }
    rb_str_modify(str);
    if (len == 0) {
        ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
        return str;
    }
    if (STR_ASSOC_P(str)) {
        /* drop the association; the aux slot is reused for the capacity */
        FL_UNSET(str, STR_ASSOC);
        capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
    }
    else if (STR_EMBED_P(str)) {
        capa = RSTRING_EMBED_LEN_MAX;
    }
    else {
        capa = RSTRING(str)->as.heap.aux.capa;
    }
    total = RSTRING_LEN(str)+len;
    if (capa <= total) {
        /* grow geometrically until the new content fits */
        while (total > capa) {
            capa = (capa + 1) * 2;
        }
        RESIZE_CAPA(str, capa);
    }
    if (off != -1) {
        ptr = RSTRING_PTR(str) + off;
    }
    memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
    STR_SET_LEN(str, total);
    RSTRING_PTR(str)[total] = '\0'; /* sentinel */

    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;
}
1586
1587 VALUE
1588 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
1589 {
1590     return rb_enc_cr_str_buf_cat(str, ptr, len,
1591         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
1592 }
1593
1594 VALUE
1595 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
1596 {
1597     /* ptr must reference NUL terminated ASCII string. */
1598     int encindex = ENCODING_GET(str);
1599     rb_encoding *enc = rb_enc_from_index(encindex);
1600     if (rb_enc_asciicompat(enc)) {
1601         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
1602             encindex, ENC_CODERANGE_7BIT, 0);
1603     }
1604     else {
1605         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
1606         while (*ptr) {
1607             int c = (unsigned char)*ptr;
1608             int len = rb_enc_codelen(c, enc);
1609             rb_enc_mbcput(c, buf, enc);
1610             rb_enc_cr_str_buf_cat(str, buf, len,
1611                 encindex, ENC_CODERANGE_VALID, 0);
1612             ptr++;
1613         }
1614         return str;
1615     }
1616 }
1617
1618 VALUE
1619 rb_str_buf_append(VALUE str, VALUE str2)
1620 {
1621     int str2_cr;
1622
1623     str2_cr = ENC_CODERANGE(str2);
1624
1625     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
1626         ENCODING_GET(str2), str2_cr, &str2_cr);
1627
1628     OBJ_INFECT(str, str2);
1629     ENC_CODERANGE_SET(str2, str2_cr);
1630
1631     return str;
1632 }
1633
1634 VALUE
1635 rb_str_append(VALUE str, VALUE str2)
1636 {
1637     rb_encoding *enc;
1638     int cr, cr2;
1639
1640     StringValue(str2);
1641     if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
1642         long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
1643         enc = rb_enc_check(str, str2);
1644         cr = ENC_CODERANGE(str);
1645         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
1646         rb_str_modify(str);
1647         REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
1648         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
1649                RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
1650         RSTRING(str)->as.heap.len = len;
1651         rb_enc_associate(str, enc);
1652         ENC_CODERANGE_SET(str, cr);
1653         OBJ_INFECT(str, str2);
1654         return str;
1655     }
1656     return rb_str_buf_append(str, str2);
1657 }
1658
1659
1660 /*
1661  *  call-seq:
1662  *     str << fixnum        => str
1663  *     str.concat(fixnum)   => str
1664  *     str << obj           => str
1665  *     str.concat(obj)      => str
1666  *
1667  *  Append---Concatenates the given object to <i>str</i>. If the object is a
1668  *  <code>Fixnum</code>, it is considered as a codepoint, and is converted
1669  *  to a character before concatenation.
1670  *
1671  *     a = "hello "
1672  *     a << "world"   #=> "hello world"
1673  *     a.concat(33)   #=> "hello world!"
1674  */
1675
1676 VALUE
1677 rb_str_concat(VALUE str1, VALUE str2)
1678 {
1679     if (FIXNUM_P(str2)) {
1680         rb_encoding *enc = STR_ENC_GET(str1);
1681         int c = FIX2INT(str2);
1682         int pos = RSTRING_LEN(str1);
1683         int len = rb_enc_codelen(c, enc);
1684         int cr = ENC_CODERANGE(str1);
1685
1686         rb_str_resize(str1, pos+len);
1687         rb_enc_mbcput(c, RSTRING_PTR(str1)+pos, enc);
1688         ENC_CODERANGE_SET(str1, cr);
1689         return str1;
1690     }
1691     return rb_str_append(str1, str2);
1692 }
1693
/*
 * UNALIGNED_WORD_ACCESS is set to 1 on x86 (__i386__ or _M_IX86) and
 * defaults to 0 when nothing has defined it.  hash() below compiles its
 * alignment-aware path only when the value is 0.
 */
#if defined __i386__ || defined _M_IX86
#define UNALIGNED_WORD_ACCESS 1
#endif
#ifndef UNALIGNED_WORD_ACCESS
#define UNALIGNED_WORD_ACCESS 0
#endif
1700
/* MurmurHash described in http://murmurhash.googlepages.com/ */
/*
 * Hashes len bytes at data, starting from seed h, and returns the result.
 *
 * The input is consumed as 32-bit words.  When UNALIGNED_WORD_ACCESS is 0
 * and data is not 4-byte aligned, words are still loaded from aligned
 * addresses and re-assembled with shifts; WORDS_BIGENDIAN selects the byte
 * order used when single bytes are packed into a word.
 */
unsigned int
hash(const unsigned char * data, int len, unsigned int h)
{
    const unsigned int m = 0x7fd652ad;
    const int r = 16;

    h += 0xdeadbeef;

    if (len >= 4) {
#if !UNALIGNED_WORD_ACCESS
        int align = (VALUE)data & 3;
        if (align) {
            /* t carries bytes already read, d the word just loaded */
            uint32_t t = 0, d = 0;
            int sl, sr, pack;

            /* read the 4-align leading bytes one by one (cases fall through) */
            switch (align) {
#ifdef WORDS_BIGENDIAN
              case 1: t |= data[2];
              case 2: t |= data[1] << 8;
              case 3: t |= data[0] << 16;
#else
              case 1: t |= data[2] << 16;
              case 2: t |= data[1] << 8;
              case 3: t |= data[0];
#endif
            }

#ifdef WORDS_BIGENDIAN
            t >>= (8 * align) - 8;
#else
            t <<= (8 * align);
#endif

            /* data is 4-byte aligned from here on */
            data += 4-align;
            len -= 4-align;

            sl = 8 * (4-align);
            sr = 8 * align;

            /* combine the carried bytes with each newly loaded word */
            while (len >= 4) {
                d = *(uint32_t *)data;
#ifdef WORDS_BIGENDIAN
                t = (t << sr) | (d >> sl);
#else
                t = (t >> sr) | (d << sl);
#endif
                h += t;
                h *= m;
                h ^= h >> r;
                t = d;

                data += 4;
                len -= 4;
            }

            /* fill the last carried word with up to align trailing bytes
             * and mix it (cases fall through) */
            pack = len < align ? len : align;
            d = 0;
            switch (pack) {
#ifdef WORDS_BIGENDIAN
              case 3: d |= data[2] << 8;
              case 2: d |= data[1] << 16;
              case 1: d |= data[0] << 24;
              case 0:
                h += (t << sr) | (d >> sl);
#else
              case 3: d |= data[2] << 16;
              case 2: d |= data[1] << 8;
              case 1: d |= data[0];
              case 0:
                h += (t >> sr) | (d << sl);
#endif
                h *= m;
                h ^= h >> r;
            }

            data += pack;
            len -= pack;
        }
        else
#endif
        {
            /* direct path: mix one 32-bit word per iteration */
            do {
                h += *(uint32_t *)data;
                h *= m;
                h ^= h >> r;

                data += 4;
                len -= 4;
            } while (len >= 4);
        }
    }

    /* mix the remaining 1-3 bytes, if any (cases fall through) */
    switch(len) {
#ifdef WORDS_BIGENDIAN
      case 3:
        h += data[2] << 8;
      case 2:
        h += data[1] << 16;
      case 1:
        h += data[0] << 24;
#else
      case 3:
        h += data[2] << 16;
      case 2:
        h += data[1] << 8;
      case 1:
        h += data[0];
#endif
        h *= m;
        h ^= h >> r;
    }

    /* final mixing rounds */
    h *= m;
    h ^= h >> 10;
    h *= m;
    h ^= h >> 17;

    return h;
}
1821
/* Hashes len bytes at ptr with hash(), using seed 0. */
int
rb_memhash(const void *ptr, long len)
{
    const unsigned char *bytes = ptr;

    return hash(bytes, len, 0);
}
1827
1828 int
1829 rb_str_hash(VALUE str)
1830 {
1831     return hash((const void *)RSTRING_PTR(str), RSTRING_LEN(str), 0);
1832 }
1833
1834 int
1835 rb_str_hash_cmp(VALUE str1, VALUE str2)
1836 {
1837     int len;
1838
1839     if (!rb_str_comparable(str1, str2)) return 1;
1840     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1841         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1842         return 0;
1843     }
1844     return 1;
1845 }
1846
1847 /*
1848  * call-seq:
1849  *    str.hash   => fixnum
1850  *
1851  * Return a hash based on the string's length and content.
1852  */
1853
1854 static VALUE
1855 rb_str_hash_m(VALUE str)
1856 {
1857     int hval = rb_str_hash(str);
1858     return INT2FIX(hval);
1859 }
1860
/* Yields the smaller of a and b.  One of the arguments is evaluated twice,
 * so do not pass expressions with side effects. */
#define lesser(a,b) (((a)>(b))?(b):(a))
1862
1863 int
1864 rb_str_comparable(VALUE str1, VALUE str2)
1865 {
1866     int idx1, idx2;
1867     int rc1, rc2;
1868
1869     if (RSTRING_LEN(str1) == 0) return Qtrue;
1870     if (RSTRING_LEN(str2) == 0) return Qtrue;
1871     idx1 = ENCODING_GET(str1);
1872     idx2 = ENCODING_GET(str2);
1873     if (idx1 == idx2) return Qtrue;
1874     rc1 = rb_enc_str_coderange(str1);
1875     rc2 = rb_enc_str_coderange(str2);
1876     if (rc1 == ENC_CODERANGE_7BIT) {
1877         if (rc2 == ENC_CODERANGE_7BIT) return Qtrue;
1878         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
1879             return Qtrue;
1880     }
1881     if (rc2 == ENC_CODERANGE_7BIT) {
1882         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
1883             return Qtrue;
1884     }
1885     return Qfalse;
1886 }
1887
1888 int
1889 rb_str_cmp(VALUE str1, VALUE str2)
1890 {
1891     long len;
1892     int retval;
1893
1894     len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
1895     retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
1896     if (retval == 0) {
1897         if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
1898             if (!rb_enc_compatible(str1, str2)) {
1899                 if (ENCODING_GET(str1) - ENCODING_GET(str2) > 0)
1900                     return 1;
1901                 return -1;
1902             }
1903             return 0;
1904         }
1905         if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
1906         return -1;
1907     }
1908     if (retval > 0) return 1;
1909     return -1;
1910 }
1911
1912
1913 /*
1914  *  call-seq:
1915  *     str == obj   => true or false
1916  *
1917  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
1918  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
1919  *  <code><=></code> <i>obj</i> returns zero.
1920  */
1921
1922 VALUE
1923 rb_str_equal(VALUE str1, VALUE str2)
1924 {
1925     int len;
1926
1927     if (str1 == str2) return Qtrue;
1928     if (TYPE(str2) != T_STRING) {
1929         if (!rb_respond_to(str2, rb_intern("to_str"))) {
1930             return Qfalse;
1931         }
1932         return rb_equal(str2, str1);
1933     }
1934     if (!rb_str_comparable(str1, str2)) return Qfalse;
1935     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
1936         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
1937         return Qtrue;
1938     }
1939     return Qfalse;
1940 }
1941
1942 /*
1943  * call-seq:
1944  *   str.eql?(other)   => true or false
1945  *
1946  * Two strings are equal if they have the same length and content.
1947  */
1948
1949 static VALUE
1950 rb_str_eql(VALUE str1, VALUE str2)
1951 {
1952     if (TYPE(str2) != T_STRING || RSTRING_LEN(str1) != RSTRING_LEN(str2))
1953         return Qfalse;
1954
1955     if (!rb_str_comparable(str1, str2)) return Qfalse;
1956     if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2),
1957                lesser(RSTRING_LEN(str1), RSTRING_LEN(str2))) == 0)
1958         return Qtrue;
1959
1960     return Qfalse;
1961 }
1962
1963 /*
1964  *  call-seq:
1965  *     str <=> other_str   => -1, 0, +1
1966  *
1967  *  Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
1968  *  <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
1969  *  <i>str</i>. If the strings are of different lengths, and the strings are
1970  *  equal when compared up to the shortest length, then the longer string is
1971  *  considered greater than the shorter one. In older versions of Ruby, setting
1972  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
1973  *  in favor of using <code>String#casecmp</code>.
1974  *
1975  *  <code><=></code> is the basis for the methods <code><</code>,
1976  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
1977  *  included from module <code>Comparable</code>.  The method
1978  *  <code>String#==</code> does not use <code>Comparable#==</code>.
1979  *
1980  *     "abcdef" <=> "abcde"     #=> 1
1981  *     "abcdef" <=> "abcdef"    #=> 0
1982  *     "abcdef" <=> "abcdefg"   #=> -1
1983  *     "abcdef" <=> "ABCDEF"    #=> 1
1984  */
1985
1986 static VALUE
1987 rb_str_cmp_m(VALUE str1, VALUE str2)
1988 {
1989     long result;
1990
1991     if (TYPE(str2) != T_STRING) {
1992         if (!rb_respond_to(str2, rb_intern("to_str"))) {
1993             return Qnil;
1994         }
1995         else if (!rb_respond_to(str2, rb_intern("<=>"))) {
1996             return Qnil;
1997         }
1998         else {
1999             VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
2000
2001             if (NIL_P(tmp)) return Qnil;
2002             if (!FIXNUM_P(tmp)) {
2003                 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
2004             }
2005             result = -FIX2LONG(tmp);
2006         }
2007     }
2008     else {
2009         result = rb_str_cmp(str1, str2);
2010     }
2011     return LONG2NUM(result);
2012 }
2013
2014 /*
2015  *  call-seq:
2016  *     str.casecmp(other_str)   => -1, 0, +1
2017  *
2018  *  Case-insensitive version of <code>String#<=></code>.
2019  *
2020  *     "abcdef".casecmp("abcde")     #=> 1
2021  *     "aBcDeF".casecmp("abcdef")    #=> 0
2022  *     "abcdef".casecmp("abcdefg")   #=> -1
2023  *     "abcdef".casecmp("ABCDEF")    #=> 0
2024  */
2025
2026 static VALUE
2027 rb_str_casecmp(VALUE str1, VALUE str2)
2028 {
2029     long len;
2030     rb_encoding *enc;
2031     char *p1, *p1end, *p2, *p2end;
2032
2033     StringValue(str2);
2034     enc = rb_enc_compatible(str1, str2);
2035     if (!enc) {
2036         return Qnil;
2037     }
2038
2039     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
2040     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
2041     while (p1 < p1end && p2 < p2end) {
2042         int c1 = rb_enc_codepoint(p1, p1end, enc);
2043         int c2 = rb_enc_codepoint(p2, p2end, enc);
2044
2045         if (c1 != c2) {
2046             c1 = rb_enc_toupper(c1, enc);
2047             c2 = rb_enc_toupper(c2, enc);
2048             if (c1 > c2) return INT2FIX(1);
2049             if (c1 < c2) return INT2FIX(-1);
2050         }
2051         len = rb_enc_codelen(c1, enc);
2052         p1 += len;
2053         p2 += len;
2054     }
2055     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
2056     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
2057     return INT2FIX(-1);
2058 }
2059
2060 static long
2061 rb_str_index(VALUE str, VALUE sub, long offset)
2062 {
2063     long pos;
2064     char *s, *sptr;
2065     long len, slen;
2066     rb_encoding *enc;
2067
2068     enc = rb_enc_check(str, sub);
2069     if (is_broken_string(sub)) {
2070         return -1;
2071     }
2072     len = str_strlen(str, enc);
2073     slen = str_strlen(sub, enc);
2074     if (offset < 0) {
2075         offset += len;
2076         if (offset < 0) return -1;
2077     }
2078     if (len - offset < slen) return -1;
2079     s = RSTRING_PTR(str);
2080     if (offset) {
2081         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
2082         s += offset;
2083     }
2084     if (slen == 0) return offset;
2085     /* need proceed one character at a time */
2086     sptr = RSTRING_PTR(sub);
2087     slen = RSTRING_LEN(sub);
2088     len = RSTRING_LEN(str) - offset;
2089     for (;;) {
2090         char *t;
2091         pos = rb_memsearch(sptr, slen, s, len, enc);
2092         if (pos < 0) return pos;
2093         t = rb_enc_right_char_head(s, s+pos, enc);
2094         if (t == s + pos) break;
2095         if ((len -= t - s) <= 0) return -1;
2096         offset += t - s;
2097         s = t;
2098     }
2099     return pos + offset;
2100 }
2101
2102
2103 /*
2104  *  call-seq:
2105  *     str.index(substring [, offset])   => fixnum or nil
2106  *     str.index(fixnum [, offset])      => fixnum or nil
2107  *     str.index(regexp [, offset])      => fixnum or nil
2108  *
2109  *  Returns the index of the first occurrence of the given <i>substring</i>,
2110  *  character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns
2111  *  <code>nil</code> if not found. If the second parameter is present, it
2112  *  specifies the position in the string to begin the search.
2113  *
2114  *     "hello".index('e')             #=> 1
2115  *     "hello".index('lo')            #=> 3
2116  *     "hello".index('a')             #=> nil
2117  *     "hello".index(?e)              #=> 1
2118  *     "hello".index(101)             #=> 1
2119  *     "hello".index(/[aeiou]/, -3)   #=> 4
2120  */
2121
2122 static VALUE
2123 rb_str_index_m(int argc, VALUE *argv, VALUE str)
2124 {
2125     VALUE sub;
2126     VALUE initpos;
2127     long pos;
2128
2129     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
2130         pos = NUM2LONG(initpos);
2131     }
2132     else {
2133         pos = 0;
2134     }
2135     if (pos < 0) {
2136         pos += str_strlen(str, STR_ENC_GET(str));
2137         if (pos < 0) {
2138             if (TYPE(sub) == T_REGEXP) {
2139                 rb_backref_set(Qnil);
2140             }
2141             return Qnil;
2142         }
2143     }
2144
2145     switch (TYPE(sub)) {
2146       case T_REGEXP:
2147         pos = rb_reg_adjust_startpos(sub, str, pos, 0);
2148         pos = rb_reg_search(sub, str, pos, 0);
2149         pos = rb_str_sublen(str, pos);
2150         break;
2151
2152       default: {
2153         VALUE tmp;
2154
2155         tmp = rb_check_string_type(sub);
2156         if (NIL_P(tmp)) {
2157             rb_raise(rb_eTypeError, "type mismatch: %s given",
2158                      rb_obj_classname(sub));
2159         }
2160         sub = tmp;
2161       }
2162         /* fall through */
2163       case T_STRING:
2164         pos = rb_str_index(str, sub, pos);
2165         pos = rb_str_sublen(str, pos);
2166         break;
2167     }
2168
2169     if (pos == -1) return Qnil;
2170     return LONG2NUM(pos);
2171 }
2172
2173 static long
2174 rb_str_rindex(VALUE str, VALUE sub, long pos)
2175 {
2176     long len, slen;
2177     char *s, *sbeg, *e, *t;
2178     rb_encoding *enc;
2179     int singlebyte = single_byte_optimizable(str);
2180
2181     enc = rb_enc_check(str, sub);
2182     if (is_broken_string(sub)) {
2183         return -1;
2184     }
2185     len = str_strlen(str, enc);
2186     slen = str_strlen(sub, enc);
2187     /* substring longer than string */
2188     if (len < slen) return -1;
2189     if (len - pos < slen) {
2190         pos = len - slen;
2191     }
2192     if (len == 0) {
2193         return pos;
2194     }
2195     sbeg = RSTRING_PTR(str);
2196     e = RSTRING_END(str);
2197     t = RSTRING_PTR(sub);
2198     slen = RSTRING_LEN(sub);
2199     for (;;) {
2200         s = str_nth(sbeg, e, pos, enc, singlebyte);
2201         if (!s) return -1;
2202         if (memcmp(s, t, slen) == 0) {
2203             return pos;
2204         }
2205         if (pos == 0) break;
2206         pos--;
2207     }
2208     return -1;
2209 }
2210
2211
2212 /*
2213  *  call-seq:
2214  *     str.rindex(substring [, fixnum])   => fixnum or nil
2215  *     str.rindex(fixnum [, fixnum])   => fixnum or nil
2216  *     str.rindex(regexp [, fixnum])   => fixnum or nil
2217  *
2218  *  Returns the index of the last occurrence of the given <i>substring</i>,
2219  *  character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns
2220  *  <code>nil</code> if not found. If the second parameter is present, it
2221  *  specifies the position in the string to end the search---characters beyond
2222  *  this point will not be considered.
2223  *
2224  *     "hello".rindex('e')             #=> 1
2225  *     "hello".rindex('l')             #=> 3
2226  *     "hello".rindex('a')             #=> nil
2227  *     "hello".rindex(?e)              #=> 1
2228  *     "hello".rindex(101)             #=> 1
2229  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
2230  */
2231
2232 static VALUE
2233 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
2234 {
2235     VALUE sub;
2236     VALUE vpos;
2237     rb_encoding *enc = STR_ENC_GET(str);
2238     long pos, len = str_strlen(str, enc);
2239
2240     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
2241         pos = NUM2LONG(vpos);
2242         if (pos < 0) {
2243             pos += len;
2244             if (pos < 0) {
2245                 if (TYPE(sub) == T_REGEXP) {
2246                     rb_backref_set(Qnil);
2247                 }
2248                 return Qnil;
2249             }
2250         }
2251         if (pos > len) pos = len;
2252     }
2253     else {
2254         pos = len;
2255     }
2256
2257     switch (TYPE(sub)) {
2258       case T_REGEXP:
2259         /* enc = rb_get_check(str, sub); */
2260         if (RREGEXP(sub)->len) {
2261             pos = rb_reg_adjust_startpos(sub, str, pos, 1);
2262             pos = rb_reg_search(sub, str, pos, 1);
2263             pos = rb_str_sublen(str, pos);
2264         }
2265         if (pos >= 0) return LONG2NUM(pos);
2266         break;
2267
2268       default: {
2269         VALUE tmp;
2270
2271         tmp = rb_check_string_type(sub);
2272         if (NIL_P(tmp)) {
2273             rb_raise(rb_eTypeError, "type mismatch: %s given",
2274                      rb_obj_classname(sub));
2275         }
2276         sub = tmp;
2277       }
2278         /* fall through */
2279       case T_STRING:
2280         pos = rb_str_rindex(str, sub, pos);
2281         if (pos >= 0) return LONG2NUM(pos);
2282         break;
2283     }
2284     return Qnil;
2285 }
2286
2287 /*
2288  *  call-seq:
2289  *     str =~ obj   => fixnum or nil
2290  *
2291  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
 *  against <i>str</i>, and returns the position where the match starts, or
2293  *  <code>nil</code> if there is no match. Otherwise, invokes
2294  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
2295  *  <code>=~</code> in <code>Object</code> returns <code>false</code>.
2296  *
2297  *     "cat o' 9 tails" =~ /\d/   #=> 7
2298  *     "cat o' 9 tails" =~ 9      #=> nil
2299  */
2300
2301 static VALUE
2302 rb_str_match(VALUE x, VALUE y)
2303 {
2304     switch (TYPE(y)) {
2305       case T_STRING:
2306         rb_raise(rb_eTypeError, "type mismatch: String given");
2307
2308       case T_REGEXP:
2309         return rb_reg_match(y, x);
2310
2311       default:
2312         return rb_funcall(y, rb_intern("=~"), 1, x);
2313     }
2314 }
2315
2316
/* forward declaration: get_pat() is used below before its definition;
 * arguments are the pattern object and a "quote" flag */
static VALUE get_pat(VALUE, int);
2318
2319
2320 /*
2321  *  call-seq:
2322  *     str.match(pattern)   => matchdata or nil
2323  *
2324  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
2325  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
2326  *  parameter is present, it specifies the position in the string to begin the
2327  *  search.
2328  *
2329  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
2330  *     'hello'.match('(.)\1')[0]   #=> "ll"
2331  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
2332  *     'hello'.match('xx')         #=> nil
2333  *
 *  If a block is given, invokes the block with the MatchData if the match
 *  succeeds, so that you can write
2336  *
2337  *     str.match(pat) {|m| ...}
2338  *
2339  *  instead of
2340  *
2341  *     if m = str.match(pat)
2342  *       ...
2343  *     end
2344  *
2345  *  The return value is a value from block execution in this case.
2346  */
2347
2348 static VALUE
2349 rb_str_match_m(int argc, VALUE *argv, VALUE str)
2350 {
2351     VALUE re, result;
2352     if (argc < 1)
2353         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2354     re = argv[0];
2355     argv[0] = str;
2356     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
2357     if (!NIL_P(result) && rb_block_given_p()) {
2358         return rb_yield(result);
2359     }
2360     return result;
2361 }
2362
/* Result codes of the enc_succ_char / enc_pred_char / enc_succ_alnum_char
 * helpers that step a single character forwards or backwards. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* no neighbor: not a usable character, or a range of one */
    NEIGHBOR_FOUND,     /* the neighboring character was written in place */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the range and wrapped */
};
2368
/*
 * Overwrites the len bytes at p with the next byte sequence, in increasing
 * byte order, that rb_enc_precise_mbclen() accepts as a single character of
 * exactly len bytes in enc.
 *
 * Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when every byte
 * was 0xff; in that case the buffer has been reset to all zero bytes.
 */
static enum neighbor_char
enc_succ_char(char *p, int len, rb_encoding *enc)
{
    int i, l;
    while (1) {
        /* increment with carry: trailing 0xff bytes roll over to 0 ... */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        /* ... and the first byte that is not 0xff is bumped by one */
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was recognized: saturate the bytes
                 * after it so the next round carries into that prefix */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            int len2, l2;
            /* find the longest prefix that is not reported as invalid */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* saturate everything after byte len2 so the next round
             * carries into it instead of stepping byte by byte */
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
2400
/*
 * Overwrites the len bytes at p with the previous byte sequence, in
 * decreasing byte order, that rb_enc_precise_mbclen() accepts as a single
 * character of exactly len bytes in enc.
 *
 * Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when every byte
 * was zero; in that case the buffer has been reset to all 0xff bytes.
 */
static enum neighbor_char
enc_pred_char(char *p, int len, rb_encoding *enc)
{
    int i, l;
    while (1) {
        /* decrement with borrow: trailing zero bytes roll over to 0xff ... */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        /* ... and the first non-zero byte is lowered by one */
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was recognized: clear the bytes
                 * after it so the next round borrows from that prefix */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            int len2, l2;
            /* find the longest prefix that is not reported as invalid */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* clear everything after byte len2 so the next round borrows
             * from it instead of stepping byte by byte */
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
2432
2433 /*
2434   overwrite +p+ by succeeding letter in +enc+ and returns
2435   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
2436   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
2437   assuming each ranges are successive, and mbclen
2438   never change in each ranges.
2439   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
2440   character.
2441  */
2442 static enum neighbor_char
2443 enc_succ_alnum_char(char *p, int len, rb_encoding *enc, char *carry)
2444 {
2445     enum neighbor_char ret;
2446     int c;
2447     int ctype;
2448     int range;
2449     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
2450
2451     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2452     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
2453         ctype = ONIGENC_CTYPE_DIGIT;
2454     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
2455         ctype = ONIGENC_CTYPE_ALPHA;
2456     else
2457         return NEIGHBOR_NOT_CHAR;
2458
2459     MEMCPY(save, p, char, len);
2460     ret = enc_succ_char(p, len, enc);
2461     if (ret == NEIGHBOR_FOUND) {
2462         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2463         if (rb_enc_isctype(c, ctype, enc))
2464             return NEIGHBOR_FOUND;
2465     }
2466     MEMCPY(p, save, char, len);
2467     range = 1;
2468     while (1) {
2469         MEMCPY(save, p, char, len);
2470         ret = enc_pred_char(p, len, enc);
2471         if (ret == NEIGHBOR_FOUND) {
2472             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
2473             if (!rb_enc_isctype(c, ctype, enc)) {
2474                 MEMCPY(p, save, char, len);
2475                 break;
2476             }
2477         }
2478         else {
2479             MEMCPY(p, save, char, len);
2480             break;
2481         }
2482         range++;
2483     }
2484     if (range == 1) {
2485         return NEIGHBOR_NOT_CHAR;
2486     }
2487
2488     if (ctype != ONIGENC_CTYPE_DIGIT) {
2489         MEMCPY(carry, p, char, len);
2490         return NEIGHBOR_WRAPPED;
2491     }
2492
2493     MEMCPY(carry, p, char, len);
2494     enc_succ_char(carry, len, enc);
2495     return NEIGHBOR_WRAPPED;
2496 }
2497
2498
2499 /*
2500  *  call-seq:
2501  *     str.succ   => new_str
2502  *     str.next   => new_str
2503  *
2504  *  Returns the successor to <i>str</i>. The successor is calculated by
2505  *  incrementing characters starting from the rightmost alphanumeric (or
2506  *  the rightmost character if there are no alphanumerics) in the
2507  *  string. Incrementing a digit always results in another digit, and
2508  *  incrementing a letter results in another letter of the same case.
2509  *  Incrementing nonalphanumerics uses the underlying character set's
2510  *  collating sequence.
2511  *
2512  *  If the increment generates a ``carry,'' the character to the left of
2513  *  it is incremented. This process repeats until there is no carry,
2514  *  adding an additional character if necessary.
2515  *
2516  *     "abcd".succ        #=> "abce"
2517  *     "THX1138".succ     #=> "THX1139"
2518  *     "<<koala>>".succ   #=> "<<koalb>>"
2519  *     "1999zzz".succ     #=> "2000aaa"
2520  *     "ZZZ9999".succ     #=> "AAAA0000"
2521  *     "***".succ         #=> "**+"
2522  */
2523
VALUE
rb_str_succ(VALUE orig)
{
    rb_encoding *enc;
    VALUE str;
    char *sbeg, *s, *e;
    int c = -1;                 /* stays -1 until an alnum character wrapped */
    long l;
    /* default carry: a single "\1" byte, inserted at offset 0 */
    char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
    int carry_pos = 0, carry_len = 1;

    /* all work is done on a new string built from orig's bytes */
    str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
    rb_enc_cr_str_copy_for_substr(str, orig);
    OBJ_INFECT(str, orig);
    if (RSTRING_LEN(str) == 0) return str;

    enc = STR_ENC_GET(orig);
    sbeg = RSTRING_PTR(str);
    s = e = sbeg + RSTRING_LEN(str);

    /* first pass: walk the characters right to left, incrementing
     * alphanumerics and rippling the carry leftwards */
    while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
        enum neighbor_char neighbor;
        /* skip positions that do not hold a character */
        if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
        neighbor = enc_succ_alnum_char(s, l, enc, carry);
        if (neighbor == NEIGHBOR_NOT_CHAR)
            continue;
        if (neighbor == NEIGHBOR_FOUND)
            return str;
        /* wrapped: remember where the carry would have to be inserted
         * and keep going with the next character to the left */
        c = 1;
        carry_pos = s - sbeg;
        carry_len = l;
    }
    if (c == -1) {              /* str contains no alnum */
        /* second pass: increment raw characters from the right instead */
        s = e;
        while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
            enum neighbor_char neighbor;
            if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
            neighbor = enc_succ_char(s, l, enc);
            if (neighbor == NEIGHBOR_FOUND)
                return str;
            if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
                /* wrapped to \0...\0.  search next valid char. */
                enc_succ_char(s, l, enc);
            }
            if (!rb_enc_asciicompat(enc)) {
                /* for non-ASCII-compatible encodings the carry is a copy
                 * of the wrapped character rather than the "\1" default */
                MEMCPY(carry, s, char, l);
                carry_len = l;
            }
            carry_pos = s - sbeg;
        }
    }
    /* a carry is left over: open a gap of carry_len bytes at byte offset
     * carry_pos and copy the carry character into it */
    RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
    s = RSTRING_PTR(str) + carry_pos;
    memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
    memmove(s, carry, carry_len);
    STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
    RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
    rb_enc_str_coderange(str);
    return str;
}
2584
2585
2586 /*
2587  *  call-seq:
2588  *     str.succ!   => str
2589  *     str.next!   => str
2590  *
2591  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
2592  *  place.
2593  */
2594
2595 static VALUE
2596 rb_str_succ_bang(VALUE str)
2597 {
2598     rb_str_shared_replace(str, rb_str_succ(str));
2599
2600     return str;
2601 }
2602
2603
2604 /*
2605  *  call-seq:
2606  *     str.upto(other_str, exclusive=false) {|s| block }   => str
2607  *
2608  *  Iterates through successive values, starting at <i>str</i> and
2609  *  ending at <i>other_str</i> inclusive, passing each value in turn to
2610  *  the block. The <code>String#succ</code> method is used to generate
2611  *  each value.  If optional second argument exclusive is omitted or is <code>false</code>,
2612  *  the last value will be included; otherwise it will be excluded.
2613  *
2614  *     "a8".upto("b6") {|s| print s, ' ' }
2615  *     for s in "a8".."b6"
2616  *       print s, ' '
2617  *     end
2618  *
2619  *  <em>produces:</em>
2620  *
2621  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2622  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
2623  */
2624
2625 static VALUE
2626 rb_str_upto(int argc, VALUE *argv, VALUE beg)
2627 {
2628     VALUE end, exclusive;
2629     VALUE current, after_end;
2630     ID succ;
2631     int n, excl;
2632     rb_encoding *enc;
2633
2634     rb_scan_args(argc, argv, "11", &end, &exclusive);
2635     excl = RTEST(exclusive);
2636     succ = rb_intern("succ");
2637     StringValue(end);
2638     enc = rb_enc_check(beg, end);
2639     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 &&
2640         is_ascii_string(beg) && is_ascii_string(end)) {
2641         char c = RSTRING_PTR(beg)[0];
2642         char e = RSTRING_PTR(end)[0];
2643
2644         if (c > e || (excl && c == e)) return beg;
2645         for (;;) {
2646             rb_yield(rb_enc_str_new(&c, 1, enc));
2647             if (!excl && c == e) break;
2648             c++;
2649             if (excl && c == e) break;
2650         }
2651         return beg;
2652     }
2653     n = rb_str_cmp(beg, end);
2654     if (n > 0 || (excl && n == 0)) return beg;
2655
2656     after_end = rb_funcall(end, succ, 0, 0);
2657     current = beg;
2658     while (!rb_str_equal(current, after_end)) {
2659         rb_yield(current);
2660         if (!excl && rb_str_equal(current, end)) break;
2661         current = rb_funcall(current, succ, 0, 0);
2662         StringValue(current);
2663         if (excl && rb_str_equal(current, end)) break;
2664         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
2665             break;
2666     }
2667
2668     return beg;
2669 }
2670
2671 static VALUE
2672 rb_str_subpat(VALUE str, VALUE re, int nth)
2673 {
2674     if (rb_reg_search(re, str, 0, 0) >= 0) {
2675         return rb_reg_nth_match(nth, rb_backref_get());
2676     }
2677     return Qnil;
2678 }
2679
2680 static VALUE
2681 rb_str_aref(VALUE str, VALUE indx)
2682 {
2683     long idx;
2684
2685     switch (TYPE(indx)) {
2686       case T_FIXNUM:
2687         idx = FIX2LONG(indx);
2688
2689       num_index:
2690         str = rb_str_substr(str, idx, 1);
2691         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
2692         return str;
2693
2694       case T_REGEXP:
2695         return rb_str_subpat(str, indx, 0);
2696
2697       case T_STRING:
2698         if (rb_str_index(str, indx, 0) != -1)
2699             return rb_str_dup(indx);
2700         return Qnil;
2701
2702       default:
2703         /* check if indx is Range */
2704         {
2705             long beg, len;
2706             VALUE tmp;
2707
2708             len = str_strlen(str, STR_ENC_GET(str));
2709             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
2710               case Qfalse:
2711                 break;
2712               case Qnil:
2713                 return Qnil;
2714               default:
2715                 tmp = rb_str_substr(str, beg, len);
2716                 return tmp;
2717             }
2718         }
2719         idx = NUM2LONG(indx);
2720         goto num_index;
2721     }
2722     return Qnil;                /* not reached */
2723 }
2724
2725
2726 /*
2727  *  call-seq:
2728  *     str[fixnum]                 => new_str or nil
2729  *     str[fixnum, fixnum]         => new_str or nil
2730  *     str[range]                  => new_str or nil
2731  *     str[regexp]                 => new_str or nil
2732  *     str[regexp, fixnum]         => new_str or nil
2733  *     str[other_str]              => new_str or nil
2734  *     str.slice(fixnum)           => new_str or nil
2735  *     str.slice(fixnum, fixnum)   => new_str or nil
2736  *     str.slice(range)            => new_str or nil
2737  *     str.slice(regexp)           => new_str or nil
2738  *     str.slice(regexp, fixnum)   => new_str or nil
2739  *     str.slice(other_str)        => new_str or nil
2740  *
2741  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
2742  *  substring of one character at that position. If passed two <code>Fixnum</code>
2743  *  objects, returns a substring starting at the offset given by the first, and
2744  *  a length given by the second. If given a range, a substring containing
2745  *  characters at offsets given by the range is returned. In all three cases, if
2746  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
2747  *  <code>nil</code> if the initial offset falls outside the string, the length
2748  *  is negative, or the beginning of the range is greater than the end.
2749  *
2750  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
2751  *  returned. If a numeric parameter follows the regular expression, that
2752  *  component of the <code>MatchData</code> is returned instead. If a
2753  *  <code>String</code> is given, that string is returned if it occurs in
2754  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
2755  *  match.
2756  *
2757  *     a = "hello there"
2758  *     a[1]                   #=> "e"
2759  *     a[1,3]                 #=> "ell"
2760  *     a[1..3]                #=> "ell"
2761  *     a[-3,2]                #=> "er"
2762  *     a[-4..-2]              #=> "her"
2763  *     a[12..-1]              #=> nil
2764  *     a[-2..-4]              #=> ""
2765  *     a[/[aeiou](.)\1/]      #=> "ell"
2766  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
2767  *     a[/[aeiou](.)\1/, 1]   #=> "l"
2768  *     a[/[aeiou](.)\1/, 2]   #=> nil
2769  *     a["lo"]                #=> "lo"
2770  *     a["bye"]               #=> nil
2771  */
2772
2773 static VALUE
2774 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
2775 {
2776     if (argc == 2) {
2777         if (TYPE(argv[0]) == T_REGEXP) {
2778             return rb_str_subpat(str, argv[0], NUM2INT(argv[1]));
2779         }
2780         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
2781     }
2782     if (argc != 1) {
2783         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
2784     }
2785     return rb_str_aref(str, argv[0]);
2786 }
2787
/*
 * Replaces len bytes of str, starting at byte offset beg, with the bytes
 * of val.  beg and len are used directly in pointer arithmetic on the
 * string buffer, so they are byte (not character) quantities; the callers
 * in this file translate character positions before calling.
 * The taint of val is propagated to str.
 */
static void
rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
{
    rb_str_modify(str);
    if (len < RSTRING_LEN(val)) {
        /* expand string */
        RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
    }

    if (RSTRING_LEN(val) != len) {
        /* shift the tail that follows the replaced region to its new place */
        memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
                RSTRING_PTR(str) + beg + len,
                RSTRING_LEN(str) - (beg + len));
    }
    /* NOTE(review): only reachable with a negative len, which the callers
     * visible in this file never pass -- confirm before relying on it */
    if (RSTRING_LEN(val) < beg && len < 0) {
        MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
    }
    if (RSTRING_LEN(val) > 0) {
        /* drop the replacement into the gap */
        memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
    }
    STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
    if (RSTRING_PTR(str)) {
        /* keep the buffer NUL-terminated */
        RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
    }
    OBJ_INFECT(str, val);
}
2814
2815 static void
2816 rb_str_splice(VALUE str, long beg, long len, VALUE val)
2817 {
2818     long slen;
2819     char *p, *e;
2820     rb_encoding *enc;
2821     int singlebyte = single_byte_optimizable(str);
2822
2823     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
2824
2825     StringValue(val);
2826     rb_str_modify(str);
2827     enc = rb_enc_check(str, val);
2828     slen = str_strlen(str, enc);
2829
2830     if (slen < beg) {
2831       out_of_range:
2832         rb_raise(rb_eIndexError, "index %ld out of string", beg);
2833     }
2834     if (beg < 0) {
2835         if (-beg > slen) {
2836             goto out_of_range;
2837         }
2838         beg += slen;
2839     }
2840     if (slen < len || slen < beg + len) {
2841         len = slen - beg;
2842     }
2843     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
2844     if (!p) p = RSTRING_END(str);
2845     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
2846     if (!e) e = RSTRING_END(str);
2847     /* error check */
2848     beg = p - RSTRING_PTR(str); /* physical position */
2849     len = e - p;                /* physical length */
2850     rb_str_splice_0(str, beg, len, val);
2851     rb_enc_associate(str, enc);
2852 }
2853
/*
 * Non-static entry point that forwards unchanged to rb_str_splice():
 * replaces len characters of str, starting at character index beg,
 * with val.
 */
void
rb_str_update(VALUE str, long beg, long len, VALUE val)
{
    rb_str_splice(str, beg, len, val);
}
2859
2860 static void
2861 rb_str_subpat_set(VALUE str, VALUE re, int nth, VALUE val)
2862 {
2863     VALUE match;
2864     long start, end, len;
2865     rb_encoding *enc;
2866     struct re_registers *regs;
2867
2868     if (rb_reg_search(re, str, 0, 0) < 0) {
2869         rb_raise(rb_eIndexError, "regexp not matched");
2870     }
2871     match = rb_backref_get();
2872     regs = RMATCH_REGS(match);
2873     if (nth >= regs->num_regs) {
2874       out_of_range:
2875         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
2876     }
2877     if (nth < 0) {
2878         if (-nth >= regs->num_regs) {
2879             goto out_of_range;
2880         }
2881         nth += regs->num_regs;
2882     }
2883
2884     start = BEG(nth);
2885     if (start == -1) {
2886         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
2887     }
2888     end = END(nth);
2889     len = end - start;
2890     StringValue(val);
2891     enc = rb_enc_check(str, val);
2892     rb_str_splice_0(str, start, len, val);
2893     rb_enc_associate(str, enc);
2894 }
2895
2896 static VALUE
2897 rb_str_aset(VALUE str, VALUE indx, VALUE val)
2898 {
2899     long idx, beg;
2900
2901     switch (TYPE(indx)) {
2902       case T_FIXNUM:
2903         idx = FIX2LONG(indx);
2904       num_index:
2905         rb_str_splice(str, idx, 1, val);
2906         return val;
2907
2908       case T_REGEXP:
2909         rb_str_subpat_set(str, indx, 0, val);
2910         return val;
2911
2912       case T_STRING:
2913         beg = rb_str_index(str, indx, 0);
2914         if (beg < 0) {
2915             rb_raise(rb_eIndexError, "string not matched");
2916         }
2917         beg = rb_str_sublen(str, beg);
2918         rb_str_splice(str, beg, str_strlen(indx, 0), val);
2919         return val;
2920
2921       default:
2922         /* check if indx is Range */
2923         {
2924             long beg, len;
2925             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
2926                 rb_str_splice(str, beg, len, val);
2927                 return val;
2928             }
2929         }
2930         idx = NUM2LONG(indx);
2931         goto num_index;
2932     }
2933 }
2934
2935 /*
2936  *  call-seq:
2937  *     str[fixnum] = new_str
2938  *     str[fixnum, fixnum] = new_str
2939  *     str[range] = aString
2940  *     str[regexp] = new_str
2941  *     str[regexp, fixnum] = new_str
2942  *     str[other_str] = new_str
2943  *
2944  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
2945  *  portion of the string affected is determined using the same criteria as
2946  *  <code>String#[]</code>. If the replacement string is not the same length as
2947  *  the text it is replacing, the string will be adjusted accordingly. If the
 *  regular expression or string used as the index doesn't match a position
 *  in the string, <code>IndexError</code> is raised. If the regular expression
 *  form is used, the optional second <code>Fixnum</code> allows you to specify
 *  which portion of the match to replace (effectively using the
 *  <code>MatchData</code> indexing rules). The forms that take a
 *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
 *  out of range; the <code>Range</code> form will raise a
 *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
 *  forms will raise an <code>IndexError</code> when there is no match.
2957  */
2958
2959 static VALUE
2960 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
2961 {
2962     if (argc == 3) {
2963         if (TYPE(argv[0]) == T_REGEXP) {
2964             rb_str_subpat_set(str, argv[0], NUM2INT(argv[1]), argv[2]);
2965         }
2966         else {
2967             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
2968         }
2969         return argv[2];
2970     }
2971     if (argc != 2) {
2972         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
2973     }
2974     return rb_str_aset(str, argv[0], argv[1]);
2975 }
2976
2977 /*
2978  *  call-seq:
2979  *     str.insert(index, other_str)   => str
2980  *
2981  *  Inserts <i>other_str</i> before the character at the given
2982  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
2983  *  end of the string, and insert <em>after</em> the given character.
 *  The intent is to insert <i>other_str</i> so that it starts at the given
 *  <i>index</i>.
2986  *
2987  *     "abcd".insert(0, 'X')    #=> "Xabcd"
2988  *     "abcd".insert(3, 'X')    #=> "abcXd"
2989  *     "abcd".insert(4, 'X')    #=> "abcdX"
2990  *     "abcd".insert(-3, 'X')   #=> "abXcd"
2991  *     "abcd".insert(-1, 'X')   #=> "abcdX"
2992  */
2993
2994 static VALUE
2995 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
2996 {
2997     long pos = NUM2LONG(idx);
2998
2999     if (pos == -1) {
3000         return rb_str_append(str, str2);
3001     }
3002     else if (pos < 0) {
3003         pos++;
3004     }
3005     rb_str_splice(str, pos, 0, str2);
3006     return str;
3007 }
3008
3009
3010 /*
3011  *  call-seq:
 *     str.slice!(fixnum)           => new_str or nil
3013  *     str.slice!(fixnum, fixnum)   => new_str or nil
3014  *     str.slice!(range)            => new_str or nil
3015  *     str.slice!(regexp)           => new_str or nil
3016  *     str.slice!(other_str)        => new_str or nil
3017  *
3018  *  Deletes the specified portion from <i>str</i>, and returns the portion
3019  *  deleted.
3020  *
3021  *     string = "this is a string"
3022  *     string.slice!(2)        #=> "i"
3023  *     string.slice!(3..6)     #=> " is "
3024  *     string.slice!(/s.*t/)   #=> "sa st"
3025  *     string.slice!("r")      #=> "r"
3026  *     string                  #=> "thing"
3027  */
3028
3029 static VALUE
3030 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
3031 {
3032     VALUE result;
3033     VALUE buf[3];
3034     int i;
3035
3036     if (argc < 1 || 2 < argc) {
3037         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
3038     }
3039     for (i=0; i<argc; i++) {
3040         buf[i] = argv[i];
3041     }
3042     rb_str_modify(str);
3043     buf[i] = rb_str_new(0,0);
3044     result = rb_str_aref_m(argc, buf, str);
3045     if (!NIL_P(result)) {
3046         rb_str_aset_m(argc+1, buf, str);
3047     }
3048     return result;
3049 }
3050
3051 static VALUE
3052 get_pat(VALUE pat, int quote)
3053 {
3054     VALUE val;
3055
3056     switch (TYPE(pat)) {
3057       case T_REGEXP:
3058         return pat;
3059
3060       case T_STRING:
3061         break;
3062
3063       default:
3064         val = rb_check_string_type(pat);
3065         if (NIL_P(val)) {
3066             Check_Type(pat, T_REGEXP);
3067         }
3068         pat = val;
3069     }
3070
3071     if (quote) {
3072         pat = rb_reg_quote(pat);
3073     }
3074
3075     return rb_reg_regcomp(pat);
3076 }
3077
3078
3079 /*
3080  *  call-seq:
3081  *     str.sub!(pattern, replacement)          => str or nil
3082  *     str.sub!(pattern) {|match| block }      => str or nil
3083  *
3084  *  Performs the substitutions of <code>String#sub</code> in place,
3085  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
3086  *  performed.
3087  */
3088
3089 static VALUE
3090 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
3091 {
3092     VALUE pat, repl, match, hash = Qnil;
3093     struct re_registers *regs;
3094     int iter = 0;
3095     int tainted = 0;
3096     long plen;
3097
3098     if (argc == 1 && rb_block_given_p()) {
3099         iter = 1;
3100     }
3101     else if (argc == 2) {
3102         repl = argv[1];
3103         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3104         if (NIL_P(hash)) {
3105             StringValue(repl);
3106         }
3107         if (OBJ_TAINTED(repl)) tainted = 1;
3108     }
3109     else {
3110         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3111     }
3112
3113     pat = get_pat(argv[0], 1);
3114     if (rb_reg_search(pat, str, 0, 0) >= 0) {
3115         rb_encoding *enc;
3116         int cr = ENC_CODERANGE(str);
3117
3118         match = rb_backref_get();
3119         regs = RMATCH_REGS(match);
3120
3121         if (iter || !NIL_P(hash)) {
3122             char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
3123
3124             if (iter) {
3125                 rb_match_busy(match);
3126                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3127             }
3128             else {
3129                 repl = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3130                 repl = rb_obj_as_string(repl);
3131             }
3132             str_mod_check(str, p, len);
3133             str_frozen_check(str);
3134             if (iter) rb_backref_set(match);
3135         }
3136         else {
3137             repl = rb_reg_regsub(repl, str, regs, pat);
3138         }
3139         enc = rb_enc_compatible(str, repl);
3140         if (!enc) {
3141             rb_encoding *str_enc = STR_ENC_GET(str);
3142             if (coderange_scan(RSTRING_PTR(str), BEG(0), str_enc) != ENC_CODERANGE_7BIT ||
3143                 coderange_scan(RSTRING_PTR(str)+END(0),
3144                                RSTRING_LEN(str)-END(0), str_enc) != ENC_CODERANGE_7BIT) {
3145                 rb_raise(rb_eArgError, "character encodings differ: %s and %s",
3146                          rb_enc_name(str_enc),
3147                          rb_enc_name(STR_ENC_GET(repl)));
3148             }
3149             enc = STR_ENC_GET(repl);
3150         }
3151         rb_str_modify(str);
3152         rb_enc_associate(str, enc);
3153         if (OBJ_TAINTED(repl)) tainted = 1;
3154         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
3155             int cr2 = ENC_CODERANGE(repl);
3156             if (cr2 == ENC_CODERANGE_UNKNOWN || cr2 > cr) cr = cr2;
3157         }
3158         plen = END(0) - BEG(0);
3159         if (RSTRING_LEN(repl) > plen) {
3160             RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
3161         }
3162         if (RSTRING_LEN(repl) != plen) {
3163             memmove(RSTRING_PTR(str) + BEG(0) + RSTRING_LEN(repl),
3164                     RSTRING_PTR(str) + BEG(0) + plen,
3165                     RSTRING_LEN(str) - BEG(0) - plen);
3166         }
3167         memcpy(RSTRING_PTR(str) + BEG(0),
3168                RSTRING_PTR(repl), RSTRING_LEN(repl));
3169         STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(repl) - plen);
3170         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
3171         ENC_CODERANGE_SET(str, cr);
3172         if (tainted) OBJ_TAINT(str);
3173
3174         return str;
3175     }
3176     return Qnil;
3177 }
3178
3179
3180 /*
3181  *  call-seq:
3182  *     str.sub(pattern, replacement)         => new_str
3183  *     str.sub(pattern) {|match| block }     => new_str
3184  *
3185  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
3186  *  <i>pattern</i> replaced with either <i>replacement</i> or the value of the
3187  *  block. The <i>pattern</i> will typically be a <code>Regexp</code>; if it is
3188  *  a <code>String</code> then no regular expression metacharacters will be
3189  *  interpreted (that is <code>/\d/</code> will match a digit, but
3190  *  <code>'\d'</code> will match a backslash followed by a 'd').
3191  *
3192  *  If the method call specifies <i>replacement</i>, special variables such as
3193  *  <code>$&</code> will not be useful, as substitution into the string occurs
3194  *  before the pattern match starts. However, the sequences <code>\1</code>,
3195  *  <code>\2</code>, <code>\k<group_name></code>, etc., may be used.
3196  *
3197  *  In the block form, the current match string is passed in as a parameter, and
3198  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3199  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3200  *  returned by the block will be substituted for the match on each call.
3201  *
3202  *  The result inherits any tainting in the original string or any supplied
3203  *  replacement string.
3204  *
3205  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
3206  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
3207  *     "hello".sub(/./) {|s| s[0].ord.to_s + ' ' }  #=> "104 ello"
3208  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
3209  */
3210
3211 static VALUE
3212 rb_str_sub(int argc, VALUE *argv, VALUE str)
3213 {
3214     str = rb_str_dup(str);
3215     rb_str_sub_bang(argc, argv, str);
3216     return str;
3217 }
3218
3219 static VALUE
3220 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
3221 {
3222     VALUE pat, val, repl, match, dest, hash = Qnil;
3223     struct re_registers *regs;
3224     long beg, n;
3225     long offset, blen, slen, len;
3226     int iter = 0;
3227     char *sp, *cp;
3228     int tainted = 0;
3229     rb_encoding *str_enc;
3230
3231     switch (argc) {
3232       case 1:
3233         RETURN_ENUMERATOR(str, argc, argv);
3234         iter = 1;
3235         break;
3236       case 2:
3237         repl = argv[1];
3238         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
3239         if (NIL_P(hash)) {
3240             StringValue(repl);
3241         }
3242         if (OBJ_TAINTED(repl)) tainted = 1;
3243         break;
3244       default:
3245         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
3246     }
3247
3248     pat = get_pat(argv[0], 1);
3249     offset=0; n=0;
3250     beg = rb_reg_search(pat, str, 0, 0);
3251     if (beg < 0) {
3252         if (bang) return Qnil;  /* no match, no substitution */
3253         return rb_str_dup(str);
3254     }
3255
3256     blen = RSTRING_LEN(str) + 30; /* len + margin */
3257     dest = rb_str_buf_new(blen);
3258     sp = RSTRING_PTR(str);
3259     slen = RSTRING_LEN(str);
3260     cp = sp;
3261     str_enc = STR_ENC_GET(str);
3262
3263     do {
3264         n++;
3265         match = rb_backref_get();
3266         regs = RMATCH_REGS(match);
3267         if (iter || !NIL_P(hash)) {
3268             if (iter) {
3269                 rb_match_busy(match);
3270                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
3271             }
3272             else {
3273                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
3274                 val = rb_obj_as_string(val);
3275             }
3276             str_mod_check(str, sp, slen);
3277             if (bang) str_frozen_check(str);
3278             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
3279                 rb_raise(rb_eRuntimeError, "block should not cheat");
3280             }
3281             if (iter) rb_backref_set(match);
3282         }
3283         else {
3284             val = rb_reg_regsub(repl, str, regs, pat);
3285         }
3286
3287         if (OBJ_TAINTED(val)) tainted = 1;
3288
3289         len = beg - offset;     /* copy pre-match substr */
3290         if (len) {
3291             rb_enc_str_buf_cat(dest, cp, len, str_enc);
3292         }
3293
3294         rb_str_buf_append(dest, val);
3295
3296         offset = END(0);
3297         if (BEG(0) == END(0)) {
3298             /*
3299              * Always consume at least one character of the input string
3300              * in order to prevent infinite loops.
3301              */
3302             if (RSTRING_LEN(str) <= END(0)) break;
3303             len = rb_enc_mbclen(RSTRING_PTR(str)+END(0), RSTRING_END(str), str_enc);
3304             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+END(0), len, str_enc);
3305             offset = END(0) + len;
3306         }
3307         cp = RSTRING_PTR(str) + offset;
3308         if (offset > RSTRING_LEN(str)) break;
3309         beg = rb_reg_search(pat, str, offset, 0);
3310     } while (beg >= 0);
3311     if (RSTRING_LEN(str) > offset) {
3312         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
3313     }
3314     rb_backref_set(match);
3315     if (bang) {
3316         rb_str_shared_replace(str, dest);
3317     }
3318     else {
3319         RBASIC(dest)->klass = rb_obj_class(str);
3320         OBJ_INFECT(dest, str);
3321         str = dest;
3322     }
3323
3324     if (tainted) OBJ_TAINT(str);
3325     return str;
3326 }
3327
3328
3329 /*
3330  *  call-seq:
3331  *     str.gsub!(pattern, replacement)        => str or nil
3332  *     str.gsub!(pattern) {|match| block }    => str or nil
3333  *
3334  *  Performs the substitutions of <code>String#gsub</code> in place, returning
3335  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
3336  */
3337
3338 static VALUE
3339 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
3340 {
3341     return str_gsub(argc, argv, str, 1);
3342 }
3343
3344
3345 /*
3346  *  call-seq:
3347  *     str.gsub(pattern, replacement)       => new_str
3348  *     str.gsub(pattern) {|match| block }   => new_str
3349  *
3350  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of <i>pattern</i>
3351  *  replaced with either <i>replacement</i> or the value of the block. The
3352  *  <i>pattern</i> will typically be a <code>Regexp</code>; if it is a
3353  *  <code>String</code> then no regular expression metacharacters will be
3354  *  interpreted (that is <code>/\d/</code> will match a digit, but
3355  *  <code>'\d'</code> will match a backslash followed by a 'd').
3356  *
3357  *  If a string is used as the replacement, special variables from the match
3358  *  (such as <code>$&</code> and <code>$1</code>) cannot be substituted into it,
3359  *  as substitution into the string occurs before the pattern match
3360  *  starts. However, the sequences <code>\1</code>, <code>\2</code>,
3361  *  <code>\k<group_name></code>, and so on may be used to interpolate
3362  *  successive groups in the match.
3363  *
3364  *  In the block form, the current match string is passed in as a parameter, and
3365  *  variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
3366  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
3367  *  returned by the block will be substituted for the match on each call.
3368  *
3369  *  The result inherits any tainting in the original string or any supplied
3370  *  replacement string.
3371  *
3372  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
3373  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
3374  *     "hello".gsub(/./) {|s| s[0].ord.to_s + ' '}   #=> "104 101 108 108 111 "
3375  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
3376  */
3377
3378 static VALUE
3379 rb_str_gsub(int argc, VALUE *argv, VALUE str)
3380 {
3381     return str_gsub(argc, argv, str, 0);
3382 }
3383
3384
3385 /*
3386  *  call-seq:
3387  *     str.replace(other_str)   => str
3388  *
3389  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
3390  *  values in <i>other_str</i>.
3391  *
3392  *     s = "hello"         #=> "hello"
3393  *     s.replace "world"   #=> "world"
3394  */
3395
3396 static VALUE
3397 rb_str_replace(VALUE str, VALUE str2)
3398 {
3399     long len;
3400     if (str == str2) return str;
3401
3402     StringValue(str2);
3403     len = RSTRING_LEN(str2);
3404     if (STR_ASSOC_P(str2)) {
3405         str2 = rb_str_new4(str2);
3406     }
3407     if (STR_SHARED_P(str2)) {
3408         if (str_independent(str) && !STR_EMBED_P(str)) {
3409             free(RSTRING_PTR(str));
3410         }
3411         STR_SET_NOEMBED(str);
3412         RSTRING(str)->as.heap.len = len;
3413         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
3414         FL_SET(str, ELTS_SHARED);
3415         FL_UNSET(str, STR_ASSOC);
3416         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
3417     }
3418     else {
3419         rb_str_modify(str);
3420         str_replace_shared(str, rb_str_new4(str2));
3421     }
3422
3423     OBJ_INFECT(str, str2);
3424     rb_enc_cr_str_exact_copy(str, str2);
3425     return str;
3426 }
3427
3428 /*
3429  *  call-seq:
3430  *     string.clear    ->  string
3431  *
3432  *  Makes string empty.
3433  *
3434  *     a = "abcde"
3435  *     a.clear    #=> ""
3436  */
3437
3438 static VALUE
3439 rb_str_clear(VALUE str)
3440 {
3441     /* rb_str_modify() */       /* no need for str_make_independent */
3442     if (str_independent(str) && !STR_EMBED_P(str)) {
3443         free(RSTRING_PTR(str));
3444     }
3445     STR_SET_EMBED(str);
3446     STR_SET_EMBED_LEN(str, 0);
3447     RSTRING_PTR(str)[0] = 0;
3448     ENC_CODERANGE_CLEAR(str);
3449     return str;
3450 }
3451
3452 /*
3453  *  call-seq:
3454  *     string.chr    ->  string
3455  *
3456  *  Returns a one-character string at the beginning of the string.
3457  *
3458  *     a = "abcde"
3459  *     a.chr    #=> "a"
3460  */
3461
3462 static VALUE
3463 rb_str_chr(VALUE str)
3464 {
3465     return rb_str_substr(str, 0, 1);
3466 }
3467
3468 /*
3469  *  call-seq:
3470  *     str.getbyte(index)          => 0 .. 255
3471  *
3472  *  returns the <i>index</i>th byte as an integer.
3473  */
3474 static VALUE
3475 rb_str_getbyte(VALUE str, VALUE index)
3476 {
3477     long pos = NUM2LONG(index);
3478
3479     if (pos < 0)
3480         pos += RSTRING_LEN(str);
3481     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
3482         return Qnil;
3483
3484     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
3485 }
3486
3487 /*
3488  *  call-seq:
3489  *     str.setbyte(index, int) => int
3490  *
3491  *  modifies the <i>index</i>th byte as <i>int</i>.
3492  */
3493 static VALUE
3494 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
3495 {
3496     long pos = NUM2LONG(index);
3497     int byte = NUM2INT(value);
3498
3499     rb_str_modify(str);
3500
3501     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
3502         rb_raise(rb_eIndexError, "index %ld out of string", pos);
3503     if (pos < 0)
3504         pos += RSTRING_LEN(str);
3505
3506     RSTRING_PTR(str)[pos] = byte;
3507
3508     return value;
3509 }
3510
3511 /*
3512  *  call-seq:
3513  *     str.reverse   => new_str
3514  *
3515  *  Returns a new string with the characters from <i>str</i> in reverse order.
3516  *
3517  *     "stressed".reverse   #=> "desserts"
3518  */
3519
3520 static VALUE
3521 rb_str_reverse(VALUE str)
3522 {
3523     rb_encoding *enc;
3524     VALUE obj;
3525     char *s, *e, *p;
3526
3527     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
3528     enc = STR_ENC_GET(str);
3529     obj = rb_str_new5(str, 0, RSTRING_LEN(str));
3530     s = RSTRING_PTR(str); e = RSTRING_END(str);
3531     p = RSTRING_END(obj);
3532
3533     if (RSTRING_LEN(str) > 1) {
3534         if (single_byte_optimizable(str)) {
3535             while (s < e) {
3536                 *--p = *s++;
3537             }
3538         }
3539         else {
3540             while (s < e) {
3541                 int clen = rb_enc_mbclen(s, e, enc);
3542
3543                 p -= clen;
3544                 memcpy(p, s, clen);
3545                 s += clen;
3546             }
3547         }
3548     }
3549     STR_SET_LEN(obj, RSTRING_LEN(str));
3550     OBJ_INFECT(obj, str);
3551     rb_enc_cr_str_copy_for_substr(obj, str);
3552
3553     return obj;
3554 }
3555
3556
3557 /*
3558  *  call-seq:
3559  *     str.reverse!   => str
3560  *
3561  *  Reverses <i>str</i> in place.
3562  */
3563
3564 static VALUE
3565 rb_str_reverse_bang(VALUE str)
3566 {
3567     char *s, *e, c;
3568
3569     if (RSTRING_LEN(str) > 1) {
3570         rb_str_modify(str);
3571         s = RSTRING_PTR(str);
3572         e = RSTRING_END(str) - 1;
3573
3574         if (single_byte_optimizable(str)) {
3575             while (s < e) {
3576                 c = *s;
3577                 *s++ = *e;
3578                 *e-- = c;
3579             }
3580         }
3581         else {
3582             rb_str_shared_replace(str, rb_str_reverse(str));
3583         }
3584     }
3585     return str;
3586 }
3587
3588
3589 /*
3590  *  call-seq:
 *     str.include? other_str   => true or false
 *
 *  Returns <code>true</code> if <i>str</i> contains the given string or
 *  character. The argument must be a String or be convertible with
 *  <code>to_str</code>; a Fixnum is not accepted.
3596  *
3597  *     "hello".include? "lo"   #=> true
3598  *     "hello".include? "ol"   #=> false
3599  *     "hello".include? ?h     #=> true
3600  */
3601
3602 static VALUE
3603 rb_str_include(VALUE str, VALUE arg)
3604 {
3605     long i;
3606
3607     StringValue(arg);
3608     i = rb_str_index(str, arg, 0);
3609
3610     if (i == -1) return Qfalse;
3611     return Qtrue;
3612 }
3613
3614
3615 /*
3616  *  call-seq:
3617  *     str.to_i(base=10)   => integer
3618  *
3619  *  Returns the result of interpreting leading characters in <i>str</i> as an
3620  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
3621  *  end of a valid number are ignored. If there is not a valid number at the
 *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
 *  exception when <i>base</i> is valid; a negative <i>base</i> raises
 *  <code>ArgumentError</code>.
3624  *
3625  *     "12345".to_i             #=> 12345
3626  *     "99 red balloons".to_i   #=> 99
3627  *     "0a".to_i                #=> 0
3628  *     "0a".to_i(16)            #=> 10
3629  *     "hello".to_i             #=> 0
3630  *     "1100101".to_i(2)        #=> 101
3631  *     "1100101".to_i(8)        #=> 294977
3632  *     "1100101".to_i(10)       #=> 1100101
3633  *     "1100101".to_i(16)       #=> 17826049
3634  */
3635
3636 static VALUE
3637 rb_str_to_i(int argc, VALUE *argv, VALUE str)
3638 {
3639     int base;
3640
3641     if (argc == 0) base = 10;
3642     else {
3643         VALUE b;
3644
3645         rb_scan_args(argc, argv, "01", &b);
3646         base = NUM2INT(b);
3647     }
3648     if (base < 0) {
3649         rb_raise(rb_eArgError, "invalid radix %d", base);
3650     }
3651     return rb_str_to_inum(str, base, Qfalse);
3652 }
3653
3654
3655 /*
3656  *  call-seq:
3657  *     str.to_f   => float
3658  *
3659  *  Returns the result of interpreting leading characters in <i>str</i> as a
3660  *  floating point number. Extraneous characters past the end of a valid number
3661  *  are ignored. If there is not a valid number at the start of <i>str</i>,
3662  *  <code>0.0</code> is returned. This method never raises an exception.
3663  *
3664  *     "123.45e1".to_f        #=> 1234.5
3665  *     "45.67 degrees".to_f   #=> 45.67
3666  *     "thx1138".to_f         #=> 0.0
3667  */
3668
3669 static VALUE
3670 rb_str_to_f(VALUE str)
3671 {
3672     return DOUBLE2NUM(rb_str_to_dbl(str, Qfalse));
3673 }
3674
3675
3676 /*
3677  *  call-seq:
3678  *     str.to_s     => str
3679  *     str.to_str   => str
3680  *
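 *  Returns the receiver. If the receiver is an instance of a subclass of
 *  String, a new String with the same contents is returned instead.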
3682  */
3683
3684 static VALUE
3685 rb_str_to_s(VALUE str)
3686 {
3687     if (rb_obj_class(str) != rb_cString) {
3688         VALUE dup = str_alloc(rb_cString);
3689         rb_str_replace(dup, str);
3690         return dup;
3691     }
3692     return str;
3693 }
3694
3695 static void
3696 str_cat_char(VALUE str, int c, rb_encoding *enc)
3697 {
3698     char s[16];
3699     int n = rb_enc_codelen(c, enc);
3700
3701     rb_enc_mbcput(c, s, enc);
3702     rb_enc_str_buf_cat(str, s, n, enc);
3703 }
3704
3705 static void
3706 prefix_escape(VALUE str, int c, rb_encoding *enc)
3707 {
3708     str_cat_char(str, '\\', enc);
3709     str_cat_char(str, c, enc);
3710 }
3711
3712 /*
3713  * call-seq:
3714  *   str.inspect   => string
3715  *
3716  * Returns a printable version of _str_, surrounded by quote marks,
3717  * with special characters escaped.
3718  *
3719  *    str = "hello"
3720  *    str[3] = "\b"
3721  *    str.inspect       #=> "\"hel\\bo\""
3722  */
3723
3724 VALUE
3725 rb_str_inspect(VALUE str)
3726 {
3727     rb_encoding *enc = STR_ENC_GET(str);
3728     char *p, *pend;
3729     VALUE result = rb_str_buf_new2("");
3730
3731     if (!rb_enc_asciicompat(enc)) enc = rb_usascii_encoding();
3732     rb_enc_associate(result, enc);
3733     str_cat_char(result, '"', enc);
3734     p = RSTRING_PTR(str); pend = RSTRING_END(str);
3735     while (p < pend) {
3736         int c;
3737         int n;
3738         int cc;
3739
3740         n = rb_enc_precise_mbclen(p, pend, enc);
3741         if (!MBCLEN_CHARFOUND_P(n)) {
3742             p++;
3743             n = 1;
3744             goto escape_codepoint;
3745         }
3746         n = MBCLEN_CHARFOUND_LEN(n);
3747
3748         c = rb_enc_codepoint(p, pend, enc);
3749         n = rb_enc_codelen(c, enc);
3750
3751         p += n;
3752         if (c == '"'|| c == '\\' ||
3753             (c == '#' &&
3754              p < pend &&
3755              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
3756              (cc = rb_enc_codepoint(p,pend,enc),
3757               (cc == '$' || cc == '@' || cc == '{')))) {
3758             prefix_escape(result, c, enc);
3759         }
3760         else if (c == '\n') {
3761             prefix_escape(result, 'n', enc);
3762         }
3763         else if (c == '\r') {
3764             prefix_escape(result, 'r', enc);
3765         }
3766         else if (c == '\t') {
3767             prefix_escape(result, 't', enc);
3768         }
3769         else if (c == '\f') {
3770             prefix_escape(result, 'f', enc);
3771         }
3772         else if (c == '\013') {
3773             prefix_escape(result, 'v', enc);
3774         }
3775         else if (c == '\010') {
3776             prefix_escape(result, 'b', enc);
3777         }
3778         else if (c == '\007') {
3779             prefix_escape(result, 'a', enc);
3780         }
3781         else if (c == 033) {
3782             prefix_escape(result, 'e', enc);
3783         }
3784         else if (rb_enc_isprint(c, enc)) {
3785             rb_enc_str_buf_cat(result, p-n, n, enc);
3786         }
3787         else {
3788             char buf[5];
3789             char *s;
3790             char *q;
3791
3792           escape_codepoint:
3793             for (q = p-n; q < p; q++) {
3794                 s = buf;
3795                 sprintf(buf, "\\x%02X", *q & 0377);
3796                 while (*s) {
3797                     str_cat_char(result, *s++, enc);
3798                 }
3799             }
3800         }
3801     }
3802     str_cat_char(result, '"', enc);
3803
3804     OBJ_INFECT(result, str);
3805     return result;
3806 }
3807
/* True when p lies before e and points at '$', '@' or '{'.  rb_str_dump
 * uses it on the byte after a '#' to decide whether that '#' needs a
 * backslash. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
3809
3810 /*
3811  *  call-seq:
3812  *     str.dump   => new_str
3813  *
 *  Produces a version of <i>str</i> with all nonprinting characters replaced by
 *  <code>\xNN</code> notation and all special characters escaped.
3816  */
3817
3818 VALUE
3819 rb_str_dump(VALUE str)
3820 {
3821     rb_encoding *enc0 = rb_enc_get(str);
3822     long len;
3823     const char *p, *pend;
3824     char *q, *qend;
3825     VALUE result;
3826
3827     len = 2;                    /* "" */
3828     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3829     while (p < pend) {
3830         unsigned char c = *p++;
3831         switch (c) {
3832           case '"':  case '\\':
3833           case '\n': case '\r':
3834           case '\t': case '\f':
3835           case '\013': case '\010': case '\007': case '\033':
3836             len += 2;
3837             break;
3838
3839           case '#':
3840             len += IS_EVSTR(p, pend) ? 2 : 1;
3841             break;
3842
3843           default:
3844             if (ISPRINT(c)) {
3845                 len++;
3846             }
3847             else {
3848                 len += 4;               /* \xNN */
3849             }
3850             break;
3851         }
3852     }
3853     if (!rb_enc_asciicompat(enc0)) {
3854         len += 19;              /* ".force_encoding('')" */
3855         len += strlen(enc0->name);
3856     }
3857
3858     result = rb_str_new5(str, 0, len);
3859     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
3860     q = RSTRING_PTR(result); qend = q + len;
3861
3862     *q++ = '"';
3863     while (p < pend) {
3864         unsigned char c = *p++;
3865
3866         if (c == '"' || c == '\\') {
3867             *q++ = '\\';
3868             *q++ = c;
3869         }
3870         else if (c == '#') {
3871             if (IS_EVSTR(p, pend)) *q++ = '\\';
3872             *q++ = '#';
3873         }
3874         else if (c == '\n') {
3875             *q++ = '\\';
3876             *q++ = 'n';
3877         }
3878         else if (c == '\r') {
3879             *q++ = '\\';
3880             *q++ = 'r';
3881         }
3882         else if (c == '\t') {
3883             *q++ = '\\';
3884             *q++ = 't';
3885         }
3886         else if (c == '\f') {
3887             *q++ = '\\';
3888             *q++ = 'f';
3889         }
3890         else if (c == '\013') {
3891             *q++ = '\\';
3892             *q++ = 'v';
3893         }
3894         else if (c == '\010') {
3895             *q++ = '\\';
3896             *q++ = 'b';
3897         }
3898         else if (c == '\007') {
3899             *q++ = '\\';
3900             *q++ = 'a';
3901         }
3902         else if (c == '\033') {
3903             *q++ = '\\';
3904             *q++ = 'e';
3905         }
3906         else if (ISPRINT(c)) {
3907             *q++ = c;
3908         }
3909         else {
3910             *q++ = '\\';
3911             sprintf(q, "x%02X", c);
3912             q += 3;
3913         }
3914     }
3915     *q++ = '"';
3916     if (!rb_enc_asciicompat(enc0)) {
3917         sprintf(q, ".force_encoding(\"%s\")", enc0->name);
3918         enc0 = rb_ascii8bit_encoding();
3919     }
3920
3921     OBJ_INFECT(result, str);
3922     /* result from dump is ASCII */
3923     rb_enc_associate(result, enc0);
3924     return result;
3925 }
3926
3927
3928 /*
3929  *  call-seq:
3930  *     str.upcase!   => str or nil
3931  *
3932  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
3933  *  were made.
3934  *  Note: case replacement is effective only in ASCII region.
3935  */
3936
3937 static VALUE
3938 rb_str_upcase_bang(VALUE str)
3939 {
3940     rb_encoding *enc;
3941     char *s, *send;
3942     int modify = 0;
3943     int cr = ENC_CODERANGE(str);
3944
3945     rb_str_modify(str);
3946     enc = STR_ENC_GET(str);
3947     s = RSTRING_PTR(str); send = RSTRING_END(str);
3948     while (s < send) {
3949         int c = rb_enc_codepoint(s, send, enc);
3950
3951         if (rb_enc_islower(c, enc)) {
3952             /* assuming toupper returns codepoint with same size */
3953             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
3954             modify = 1;
3955         }
3956         s += rb_enc_codelen(c, enc);
3957     }
3958
3959     ENC_CODERANGE_SET(str, cr);
3960     if (modify) return str;
3961     return Qnil;
3962 }
3963
3964
3965 /*
3966  *  call-seq:
3967  *     str.upcase   => new_str
3968  *
3969  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
3970  *  uppercase counterparts. The operation is locale insensitive---only
3971  *  characters ``a'' to ``z'' are affected.
3972  *  Note: case replacement is effective only in ASCII region.
3973  *
3974  *     "hEllO".upcase   #=> "HELLO"
3975  */
3976
3977 static VALUE
3978 rb_str_upcase(VALUE str)
3979 {
3980     str = rb_str_dup(str);
3981     rb_str_upcase_bang(str);
3982     return str;
3983 }
3984
3985
3986 /*
3987  *  call-seq:
3988  *     str.downcase!   => str or nil
3989  *
3990  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
3991  *  changes were made.
3992  *  Note: case replacement is effective only in ASCII region.
3993  */
3994
3995 static VALUE
3996 rb_str_downcase_bang(VALUE str)
3997 {
3998     rb_encoding *enc;
3999     char *s, *send;
4000     int modify = 0;
4001     int cr = ENC_CODERANGE(str);
4002
4003     rb_str_modify(str);
4004     enc = STR_ENC_GET(str);
4005     s = RSTRING_PTR(str); send = RSTRING_END(str);
4006     while (s < send) {
4007         int c = rb_enc_codepoint(s, send, enc);
4008
4009         if (rb_enc_isupper(c, enc)) {
4010             /* assuming toupper returns codepoint with same size */
4011             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4012             modify = 1;
4013         }
4014         s += rb_enc_codelen(c, enc);
4015     }
4016
4017     ENC_CODERANGE_SET(str, cr);
4018     if (modify) return str;
4019     return Qnil;
4020 }
4021
4022
4023 /*
4024  *  call-seq:
4025  *     str.downcase   => new_str
4026  *
4027  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
4028  *  lowercase counterparts. The operation is locale insensitive---only
4029  *  characters ``A'' to ``Z'' are affected.
4030  *  Note: case replacement is effective only in ASCII region.
4031  *
4032  *     "hEllO".downcase   #=> "hello"
4033  */
4034
4035 static VALUE
4036 rb_str_downcase(VALUE str)
4037 {
4038     str = rb_str_dup(str);
4039     rb_str_downcase_bang(str);
4040     return str;
4041 }
4042
4043
4044 /*
4045  *  call-seq:
4046  *     str.capitalize!   => str or nil
4047  *
4048  *  Modifies <i>str</i> by converting the first character to uppercase and the
4049  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
4050  *  Note: case conversion is effective only in ASCII region.
4051  *
4052  *     a = "hello"
4053  *     a.capitalize!   #=> "Hello"
4054  *     a               #=> "Hello"
4055  *     a.capitalize!   #=> nil
4056  */
4057
4058 static VALUE
4059 rb_str_capitalize_bang(VALUE str)
4060 {
4061     rb_encoding *enc;
4062     char *s, *send;
4063     int modify = 0;
4064     int c;
4065     int cr = ENC_CODERANGE(str);
4066
4067     rb_str_modify(str);
4068     enc = STR_ENC_GET(str);
4069     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4070     s = RSTRING_PTR(str); send = RSTRING_END(str);
4071
4072     c = rb_enc_codepoint(s, send, enc);
4073     if (rb_enc_islower(c, enc)) {
4074         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4075         modify = 1;
4076     }
4077     s += rb_enc_codelen(c, enc);
4078     while (s < send) {
4079         c = rb_enc_codepoint(s, send, enc);
4080         if (rb_enc_isupper(c, enc)) {
4081             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4082             modify = 1;
4083         }
4084         s += rb_enc_codelen(c, enc);
4085     }
4086
4087     ENC_CODERANGE_SET(str, cr);
4088     if (modify) return str;
4089     return Qnil;
4090 }
4091
4092
4093 /*
4094  *  call-seq:
4095  *     str.capitalize   => new_str
4096  *
4097  *  Returns a copy of <i>str</i> with the first character converted to uppercase
4098  *  and the remainder to lowercase.
4099  *  Note: case conversion is effective only in ASCII region.
4100  *
4101  *     "hello".capitalize    #=> "Hello"
4102  *     "HELLO".capitalize    #=> "Hello"
4103  *     "123ABC".capitalize   #=> "123abc"
4104  */
4105
4106 static VALUE
4107 rb_str_capitalize(VALUE str)
4108 {
4109     str = rb_str_dup(str);
4110     rb_str_capitalize_bang(str);
4111     return str;
4112 }
4113
4114
4115 /*
4116  *  call-seq:
 *     str.swapcase!   => str or nil
4118  *
4119  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
4120  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
4121  *  Note: case conversion is effective only in ASCII region.
4122  */
4123
4124 static VALUE
4125 rb_str_swapcase_bang(VALUE str)
4126 {
4127     rb_encoding *enc;
4128     char *s, *send;
4129     int modify = 0;
4130     int cr = ENC_CODERANGE(str);
4131
4132     rb_str_modify(str);
4133     enc = STR_ENC_GET(str);
4134     s = RSTRING_PTR(str); send = RSTRING_END(str);
4135     while (s < send) {
4136         int c = rb_enc_codepoint(s, send, enc);
4137
4138         if (rb_enc_isupper(c, enc)) {
4139             /* assuming toupper returns codepoint with same size */
4140             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
4141             modify = 1;
4142         }
4143         else if (rb_enc_islower(c, enc)) {
4144             /* assuming toupper returns codepoint with same size */
4145             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
4146             modify = 1;
4147         }
4148         s += rb_enc_codelen(c, enc);
4149     }
4150
4151     ENC_CODERANGE_SET(str, cr);
4152     if (modify) return str;
4153     return Qnil;
4154 }
4155
4156
4157 /*
4158  *  call-seq:
4159  *     str.swapcase   => new_str
4160  *
4161  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
4162  *  to lowercase and lowercase characters converted to uppercase.
4163  *  Note: case conversion is effective only in ASCII region.
4164  *
4165  *     "Hello".swapcase          #=> "hELLO"
4166  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
4167  */
4168
4169 static VALUE
4170 rb_str_swapcase(VALUE str)
4171 {
4172     str = rb_str_dup(str);
4173     rb_str_swapcase_bang(str);
4174     return str;
4175 }
4176
/* Pointer to a run of bytes viewed as unsigned chars. */
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification, advanced by trnext().
 */
struct tr {
    int gen;     /* non-zero while a c1-c2 range is being expanded */
    int now;     /* codepoint most recently produced */
    int max;     /* final codepoint of the range being expanded */
    char *p;     /* next unread byte of the specification */
    char *pend;  /* one past the last byte of the specification */
};
4183
4184 static int
4185 trnext(struct tr *t, rb_encoding *enc)
4186 {
4187     for (;;) {
4188         if (!t->gen) {
4189             if (t->p == t->pend) return -1;
4190             if (t->p < t->pend - 1 && *t->p == '\\') {
4191                 t->p++;
4192             }
4193             t->now = rb_enc_codepoint(t->p, t->pend, enc);
4194             t->p += rb_enc_codelen(t->now, enc);
4195             if (t->p < t->pend - 1 && *t->p == '-') {
4196                 t->p++;
4197                 if (t->p < t->pend) {
4198                     int c = rb_enc_codepoint(t->p, t->pend, enc);
4199                     t->p += rb_enc_codelen(c, enc);
4200                     if (t->now > c) continue;
4201                     t->gen = 1;
4202                     t->max = c;
4203                 }
4204             }
4205             return t->now;
4206         }
4207         else if (++t->now < t->max) {
4208             return t->now;
4209         }
4210         else {
4211             t->gen = 0;
4212             return t->max;
4213         }
4214     }
4215 }
4216
4217 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
4218
4219 static VALUE
4220 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
4221 {
4222     SIGNED_VALUE trans[256];
4223     rb_encoding *enc, *e1, *e2;
4224     struct tr trsrc, trrepl;
4225     int cflag = 0;
4226     int c, last = 0, modify = 0, i;
4227     char *s, *send;
4228     VALUE hash = 0;
4229
4230     StringValue(src);
4231     StringValue(repl);
4232     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
4233     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
4234     if (RSTRING_LEN(src) >= 2 && RSTRING_PTR(src)[0] == '^') {
4235         cflag++;
4236         trsrc.p++;
4237     }
4238     if (RSTRING_LEN(repl) == 0) {
4239         return rb_str_delete_bang(1, &src, str);
4240     }
4241     e1 = rb_enc_check(str, src);
4242     e2 = rb_enc_check(str, repl);
4243     if (e1 == e2) {
4244         enc = e1;
4245     }
4246     else {
4247         enc = rb_enc_check(src, repl);
4248     }
4249     trrepl.p = RSTRING_PTR(repl);
4250     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
4251     trsrc.gen = trrepl.gen = 0;
4252     trsrc.now = trrepl.now = 0;
4253     trsrc.max = trrepl.max = 0;
4254
4255     if (cflag) {
4256         for (i=0; i<256; i++) {
4257             trans[i] = 1;
4258         }
4259         while ((c = trnext(&trsrc, enc)) >= 0) {
4260             if (c < 256) {
4261                 trans[c] = -1;
4262             }
4263             else {
4264                 if (!hash) hash = rb_hash_new();
4265                 rb_hash_aset(hash, INT2NUM(c), Qtrue);
4266             }
4267         }
4268         while ((c = trnext(&trrepl, enc)) >= 0)
4269             /* retrieve last replacer */;
4270         last = trrepl.now;
4271         for (i=0; i<256; i++) {
4272             if (trans[i] >= 0) {
4273                 trans[i] = last;
4274             }
4275         }
4276     }
4277     else {
4278         int r;
4279
4280         for (i=0; i<256; i++) {
4281             trans[i] = -1;
4282         }
4283         while ((c = trnext(&trsrc, enc)) >= 0) {
4284             r = trnext(&trrepl, enc);
4285             if (r == -1) r = trrepl.now;
4286             if (c < 256) {
4287                 trans[c] = INT2NUM(r);
4288             }
4289             else {
4290                 if (!hash) hash = rb_hash_new();
4291                 rb_hash_aset(hash, INT2NUM(c), INT2NUM(r));
4292             }
4293         }
4294     }
4295
4296     rb_str_modify(str);
4297     s = RSTRING_PTR(str); send = RSTRING_END(str);
4298     if (sflag) {
4299         int clen, tlen, max = RSTRING_LEN(str);
4300         int offset, save = -1;
4301         char *buf = ALLOC_N(char, max), *t = buf;
4302         VALUE v;
4303
4304         if (cflag) tlen = rb_enc_codelen(last, enc);
4305         while (s < send) {
4306             c = rb_enc_codepoint(s, send, enc);
4307             tlen = clen = rb_enc_codelen(c, enc);
4308
4309             s += clen;
4310             if (c < 256) {
4311                 v = trans[c] >= 0 ? trans[c] : Qnil;
4312             }
4313             else {
4314                 v = hash ? rb_hash_aref(hash, INT2NUM(c)) : Qnil;
4315             }
4316             if (!NIL_P(v)) {
4317                 if (!cflag) {
4318                     c = NUM2INT(v);
4319                     if (save == c) continue;
4320                     save = c;
4321                     tlen = rb_enc_codelen(c, enc);
4322                     modify = 1;
4323                 }
4324                 else {
4325                     save = c = last;
4326                     modify = 1;
4327                 }
4328             }
4329             else {
4330                 save = -1;
4331             }
4332             while (t - buf + tlen >= max) {
4333                 offset = t - buf;
4334                 max *= 2;
4335                 REALLOC_N(buf, char, max);
4336                 t = buf + offset;
4337             }
4338             rb_enc_mbcput(c, t, enc);
4339             t += tlen;
4340         }
4341         *t = '\0';
4342         RSTRING(str)->as.heap.ptr = buf;
4343         RSTRING(str)->as.heap.len = t - buf;
4344         STR_SET_NOEMBED(str);
4345         RSTRING(str)->as.heap.aux.capa = max;
4346     }
4347     else if (rb_enc_mbmaxlen(enc) == 1) {
4348         while (s < send) {
4349             c = (unsigned char)*s;
4350             if (trans[c] >= 0) {
4351                 if (!cflag) {
4352                     c = FIX2INT(trans[c]);
4353                     *s = c;
4354                     modify = 1;
4355                 }
4356                 else {
4357                     *s = last;
4358                     modify = 1;
4359                 }
4360             }
4361             s++;
4362         }
4363     }
4364     else {
4365         int clen, tlen, max = RSTRING_LEN(str) * 1.2;
4366         int offset;
4367         char *buf = ALLOC_N(char, max), *t = buf;
4368         VALUE v;
4369
4370         if (cflag) tlen = rb_enc_codelen(last, enc);
4371         while (s < send) {
4372             c = rb_enc_codepoint(s, send, enc);
4373             tlen = clen = rb_enc_codelen(c, enc);
4374
4375             if (c < 256) {
4376                 v = trans[c] >= 0 ? trans[c] : Qnil;
4377             }
4378             else {
4379                 v = hash ? rb_hash_aref(hash, INT2NUM(c)) : Qnil;
4380             }
4381             if (!NIL_P(v)) {
4382                 if (!cflag) {
4383                     c = NUM2INT(v);
4384                     tlen = rb_enc_codelen(c, enc);
4385                     modify = 1;
4386                 }
4387                 else {
4388                     c = last;
4389                     modify = 1;
4390                 }
4391             }
4392             while (t - buf + tlen >= max) {
4393                 offset = t - buf;
4394                 max *= 2;
4395                 REALLOC_N(buf, char, max);
4396                 t = buf + offset;
4397             }
4398             if (s != t) rb_enc_mbcput(c, t, enc);
4399             s += clen;
4400             t += tlen;
4401         }
4402         if (!STR_EMBED_P(str)) {
4403             free(RSTRING(str)->as.heap.ptr);
4404         }
4405         *t = '\0';
4406         RSTRING(str)->as.heap.ptr = buf;
4407         RSTRING(str)->as.heap.len = t - buf;
4408         STR_SET_NOEMBED(str);
4409         RSTRING(str)->as.heap.aux.capa = max;
4410     }
4411
4412     if (modify) {
4413         rb_enc_associate(str, enc);
4414         return str;
4415     }
4416     return Qnil;
4417 }
4418
4419
4420 /*
4421  *  call-seq:
4422  *     str.tr!(from_str, to_str)   => str or nil
4423  *
4424  *  Translates <i>str</i> in place, using the same rules as
4425  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
4426  *  changes were made.
4427  */
4428
4429 static VALUE
4430 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
4431 {
4432     return tr_trans(str, src, repl, 0);
4433 }
4434
4435
4436 /*
4437  *  call-seq:
4438  *     str.tr(from_str, to_str)   => new_str
4439  *
4440  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i> replaced
4441  *  by the corresponding characters in <i>to_str</i>. If <i>to_str</i> is
4442  *  shorter than <i>from_str</i>, it is padded with its last character. Both
4443  *  strings may use the c1--c2 notation to denote ranges of characters, and
4444  *  <i>from_str</i> may start with a <code>^</code>, which denotes all
4445  *  characters except those listed.
4446  *
4447  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
4448  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
4449  *     "hello".tr('el', 'ip')      #=> "hippo"
4450  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
4451  */
4452
4453 static VALUE
4454 rb_str_tr(VALUE str, VALUE src, VALUE repl)
4455 {
4456     str = rb_str_dup(str);
4457     tr_trans(str, src, repl, 0);
4458     return str;
4459 }
4460
4461 static void
4462 tr_setup_table(VALUE str, char stable[256], int first,
4463                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
4464 {
4465     char buf[256];
4466     struct tr tr;
4467     int c, l;
4468     VALUE table = 0, ptable = 0;
4469     int i, cflag = 0;
4470
4471     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
4472     tr.gen = tr.now = tr.max = 0;
4473
4474     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
4475         cflag = 1;
4476         tr.p += l;
4477     }
4478     if (first) {
4479         for (i=0; i<256; i++) {
4480             stable[i] = 1;
4481         }
4482     }
4483     for (i=0; i<256; i++) {
4484         buf[i] = cflag;
4485     }
4486
4487     while ((c = trnext(&tr, enc)) >= 0) {
4488         if (c < 256) {
4489             buf[c & 0xff] = !cflag;
4490         }
4491         else {
4492             VALUE key = INT2NUM(c);
4493
4494             if (!table) {
4495                 table = rb_hash_new();
4496                 if (cflag) {
4497                     ptable = *ctablep;
4498                     *ctablep = table;
4499                 }
4500                 else {
4501                     ptable = *tablep;
4502                     *tablep = table;
4503                 }
4504             }
4505             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
4506                 rb_hash_aset(table, key, Qtrue);
4507             }
4508         }
4509     }
4510     for (i=0; i<256; i++) {
4511         stable[i] = stable[i] && buf[i];
4512     }
4513 }
4514
4515
4516 static int
4517 tr_find(int c, char table[256], VALUE del, VALUE nodel)
4518 {
4519     if (c < 256) {
4520         return table[c] ? Qtrue : Qfalse;
4521     }
4522     else {
4523         VALUE v = INT2NUM(c);
4524
4525         if (!del || NIL_P(rb_hash_lookup(del, v))) {
4526             return Qfalse;
4527         }
4528         if (nodel && NIL_P(rb_hash_lookup(nodel, v)))
4529             return Qfalse;
4530         return Qtrue;
4531     }
4532 }
4533
4534 /*
4535  *  call-seq:
4536  *     str.delete!([other_str]+)   => str or nil
4537  *
4538  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
4539  *  <code>nil</code> if <i>str</i> was not modified.
4540  */
4541
4542 static VALUE
4543 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
4544 {
4545     char squeez[256];
4546     rb_encoding *enc = 0;
4547     char *s, *send, *t;
4548     VALUE del = 0, nodel = 0;
4549     int modify = 0;
4550     int i;
4551     int cr = ENC_CODERANGE(str);
4552
4553     if (argc < 1) {
4554         rb_raise(rb_eArgError, "wrong number of arguments");
4555     }
4556     for (i=0; i<argc; i++) {
4557         VALUE s = argv[i];
4558
4559         StringValue(s);
4560         enc = rb_enc_check(str, s);
4561         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
4562     }
4563
4564     rb_str_modify(str);
4565     s = t = RSTRING_PTR(str);
4566     if (!s || RSTRING_LEN(str) == 0) return Qnil;
4567     send = RSTRING_END(str);
4568     while (s < send) {
4569         int c = rb_enc_codepoint(s, send, enc);
4570         int clen = rb_enc_codelen(c, enc);
4571
4572         if (tr_find(c, squeez, del, nodel)) {
4573             modify = 1;
4574         }
4575         else {
4576             if (t != s) rb_enc_mbcput(c, t, enc);
4577             t += clen;
4578         }
4579         s += clen;
4580     }
4581     *t = '\0';
4582     STR_SET_LEN(str, t - RSTRING_PTR(str));
4583
4584     ENC_CODERANGE_SET(str, cr);
4585     if (modify) return str;
4586     return Qnil;
4587 }
4588
4589
4590 /*
4591  *  call-seq:
4592  *     str.delete([other_str]+)   => new_str
4593  *
4594  *  Returns a copy of <i>str</i> with all characters in the intersection of its
4595  *  arguments deleted. Uses the same rules for building the set of characters as
4596  *  <code>String#count</code>.
4597  *
4598  *     "hello".delete "l","lo"        #=> "heo"
4599  *     "hello".delete "lo"            #=> "he"
4600  *     "hello".delete "aeiou", "^e"   #=> "hell"
4601  *     "hello".delete "ej-m"          #=> "ho"
4602  */
4603
4604 static VALUE
4605 rb_str_delete(int argc, VALUE *argv, VALUE str)
4606 {
4607     str = rb_str_dup(str);
4608     rb_str_delete_bang(argc, argv, str);
4609     return str;
4610 }
4611
4612
4613 /*
4614  *  call-seq:
4615  *     str.squeeze!([other_str]*)   => str or nil
4616  *
4617  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
4618  *  <code>nil</code> if no changes were made.
4619  */
4620
4621 static VALUE
4622 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
4623 {
4624     char squeez[256];
4625     rb_encoding *enc = 0;
4626     VALUE del = 0, nodel = 0;
4627     char *s, *send, *t;
4628     int save, modify = 0;
4629     int i;
4630
4631     if (argc == 0) {
4632         enc = STR_ENC_GET(str);
4633     }
4634     else {
4635         for (i=0; i<argc; i++) {
4636             VALUE s = argv[i];
4637
4638             StringValue(s);
4639             enc = rb_enc_check(str, s);
4640             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
4641         }
4642     }
4643
4644     rb_str_modify(str);
4645     s = t = RSTRING_PTR(str);
4646     if (!s || RSTRING_LEN(str) == 0) return Qnil;
4647     send = RSTRING_END(str);
4648     save = -1;
4649     while (s < send) {
4650         int c = rb_enc_codepoint(s, send, enc);
4651         int clen = rb_enc_codelen(c, enc);
4652
4653         if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
4654             if (t != s) rb_enc_mbcput(c, t, enc);
4655             save = c;
4656             t += clen;
4657         }
4658         s += clen;
4659     }
4660     *t = '\0';
4661     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
4662         STR_SET_LEN(str, t - RSTRING_PTR(str));
4663         modify = 1;
4664     }
4665
4666     if (modify) return str;
4667     return Qnil;
4668 }
4669
4670
4671 /*
4672  *  call-seq:
4673  *     str.squeeze([other_str]*)    => new_str
4674  *
4675  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
4676  *  procedure described for <code>String#count</code>. Returns a new string
4677  *  where runs of the same character that occur in this set are replaced by a
4678  *  single character. If no arguments are given, all runs of identical
4679  *  characters are replaced by a single character.
4680  *
4681  *     "yellow moon".squeeze                  #=> "yelow mon"
4682  *     "  now   is  the".squeeze(" ")         #=> " now is the"
4683  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
4684  */
4685
4686 static VALUE
4687 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
4688 {
4689     str = rb_str_dup(str);
4690     rb_str_squeeze_bang(argc, argv, str);
4691     return str;
4692 }
4693
4694
4695 /*
4696  *  call-seq:
4697  *     str.tr_s!(from_str, to_str)   => str or nil
4698  *
4699  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
4700  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
4701  */
4702
4703 static VALUE
4704 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
4705 {
4706     return tr_trans(str, src, repl, 1);
4707 }
4708
4709
4710 /*
4711  *  call-seq:
4712  *     str.tr_s(from_str, to_str)   => new_str
4713  *
4714  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
4715  *  then removes duplicate characters in regions that were affected by the
4716  *  translation.
4717  *
4718  *     "hello".tr_s('l', 'r')     #=> "hero"
4719  *     "hello".tr_s('el', '*')    #=> "h*o"
4720  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
4721  */
4722
4723 static VALUE
4724 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
4725 {
4726     str = rb_str_dup(str);
4727     tr_trans(str, src, repl, 1);
4728     return str;
4729 }
4730
4731
4732 /*
4733  *  call-seq:
4734  *     str.count([other_str]+)   => fixnum
4735  *
4736  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
4737  *  intersection of these sets defines the characters to count in
4738  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
4739  *  negated. The sequence c1--c2 means all characters between c1 and c2.
4740  *
4741  *     a = "hello world"
4742  *     a.count "lo"            #=> 5
4743  *     a.count "lo", "o"       #=> 2
4744  *     a.count "hello", "^l"   #=> 4
4745  *     a.count "ej-m"          #=> 4
4746  */
4747
4748 static VALUE
4749 rb_str_count(int argc, VALUE *argv, VALUE str)
4750 {
4751     char table[256];
4752     rb_encoding *enc = 0;
4753     VALUE del = 0, nodel = 0;
4754     char *s, *send;
4755     int i;
4756
4757     if (argc < 1) {
4758         rb_raise(rb_eArgError, "wrong number of arguments");
4759     }
4760     for (i=0; i<argc; i++) {
4761         VALUE s = argv[i];
4762
4763         StringValue(s);
4764         enc = rb_enc_check(str, s);
4765         tr_setup_table(s, table,i==0, &del, &nodel, enc);
4766     }
4767
4768     s = RSTRING_PTR(str);
4769     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
4770     send = RSTRING_END(str);
4771     i = 0;
4772     while (s < send) {
4773         int c = rb_enc_codepoint(s, send, enc);
4774         int clen = rb_enc_codelen(c, enc);
4775
4776         if (tr_find(c, table, del, nodel)) {
4777             i++;
4778         }
4779         s += clen;
4780     }
4781     return INT2NUM(i);
4782 }
4783
4784
4785 /*
4786  *  call-seq:
4787  *     str.split(pattern=$;, [limit])   => anArray
4788  *
4789  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
4790  *  of these substrings.
4791  *
4792  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
4793  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
4794  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
4795  *  of contiguous whitespace characters ignored.
4796  *
4797  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
4798  *  pattern matches. Whenever the pattern matches a zero-length string,
4799  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
4800  *  groups, the respective matches will be returned in the array as well.
4801  *
4802  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
4803  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
4804  *  split on whitespace as if ` ' were specified.
4805  *
4806  *  If the <i>limit</i> parameter is omitted, trailing null fields are
4807  *  suppressed. If <i>limit</i> is a positive number, at most that number of
4808  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
4809  *  string is returned as the only entry in an array). If negative, there is no
4810  *  limit to the number of fields returned, and trailing null fields are not
4811  *  suppressed.
4812  *
4813  *     " now's  the time".split        #=> ["now's", "the", "time"]
4814  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
4815  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
4816  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
4817  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
4818  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
4819  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
4820  *
4821  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
4822  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
4823  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
4824  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
4825  */
4826
4827 static VALUE
4828 rb_str_split_m(int argc, VALUE *argv, VALUE str)
4829 {
4830     rb_encoding *enc;
4831     VALUE spat;
4832     VALUE limit;
4833     int awk_split = Qfalse;
4834     long beg, end, i = 0;
4835     int lim = 0;
4836     VALUE result, tmp;
4837
4838     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
4839         lim = NUM2INT(limit);
4840         if (lim <= 0) limit = Qnil;
4841         else if (lim == 1) {
4842             if (RSTRING_LEN(str) == 0)
4843                 return rb_ary_new2(0);
4844             return rb_ary_new3(1, str);
4845         }
4846         i = 1;
4847     }
4848
4849     enc = STR_ENC_GET(str);
4850     if (NIL_P(spat)) {
4851         if (!NIL_P(rb_fs)) {
4852             spat = rb_fs;
4853             goto fs_set;
4854         }
4855         awk_split = Qtrue;
4856     }
4857     else {
4858       fs_set:
4859         if (TYPE(spat) == T_STRING) {
4860             rb_encoding *enc2 = STR_ENC_GET(spat);
4861
4862             if (rb_enc_mbminlen(enc2) == 1) {
4863                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
4864                     awk_split = Qtrue;
4865                 }
4866             }
4867             else {
4868                 int l;
4869                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
4870                     RSTRING_LEN(spat) == l) {
4871                     awk_split = Qtrue;
4872                 }
4873             }
4874             if (!awk_split) {
4875                 spat = rb_reg_regcomp(rb_reg_quote(spat));
4876             }
4877         }
4878         else {
4879             spat = get_pat(spat, 1);
4880         }
4881     }
4882
4883     result = rb_ary_new();
4884     beg = 0;
4885     if (awk_split) {
4886         char *ptr = RSTRING_PTR(str);
4887         char *eptr = RSTRING_END(str);
4888         char *bptr = ptr;
4889         int skip = 1;
4890         int c;
4891
4892         end = beg;
4893         while (ptr < eptr) {
4894             c = rb_enc_codepoint(ptr, eptr, enc);
4895             ptr += rb_enc_mbclen(ptr, eptr, enc);
4896             if (skip) {
4897                 if (rb_enc_isspace(c, enc)) {
4898                     beg = ptr - bptr;
4899                 }
4900                 else {
4901                     end = ptr - bptr;
4902                     skip = 0;
4903                     if (!NIL_P(limit) && lim <= i) break;
4904                 }
4905             }
4906             else {
4907                 if (rb_enc_isspace(c, enc)) {
4908                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
4909                     skip = 1;
4910                     beg = ptr - bptr;
4911                     if (!NIL_P(limit)) ++i;
4912                 }
4913                 else {
4914                     end = ptr - bptr;
4915                 }
4916             }
4917         }
4918     }
4919     else {
4920         long start = beg;
4921         long idx;
4922         int last_null = 0;
4923         struct re_registers *regs;
4924
4925         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
4926             regs = RMATCH_REGS(rb_backref_get());
4927             if (start == end && BEG(0) == END(0)) {
4928                 if (!RSTRING_PTR(str)) {
4929                     rb_ary_push(result, rb_str_new("", 0));
4930                     break;
4931                 }
4932                 else if (last_null == 1) {
4933                     rb_ary_push(result, rb_str_subseq(str, beg,
4934                                                       rb_enc_mbclen(RSTRING_PTR(str)+beg,
4935                                                                     RSTRING_END(str),
4936                                                                     enc)));
4937                     beg = start;
4938                 }
4939                 else {
4940                     if (RSTRING_PTR(str)+start == RSTRING_END(str))
4941                         start++;
4942                     else
4943                         start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
4944                     last_null = 1;
4945                     continue;
4946                 }
4947             }
4948             else {
4949                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
4950                 beg = start = END(0);
4951             }
4952             last_null = 0;
4953
4954             for (idx=1; idx < regs->num_regs; idx++) {
4955                 if (BEG(idx) == -1) continue;
4956                 if (BEG(idx) == END(idx))
4957                     tmp = rb_str_new5(str, 0, 0);
4958                 else
4959                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
4960                 rb_ary_push(result, tmp);
4961             }
4962             if (!NIL_P(limit) && lim <= ++i) break;
4963         }
4964     }
4965     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
4966         if (RSTRING_LEN(str) == beg)
4967             tmp = rb_str_new5(str, 0, 0);
4968         else
4969             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
4970         rb_ary_push(result, tmp);
4971     }
4972     if (NIL_P(limit) && lim == 0) {
4973         while (RARRAY_LEN(result) > 0 &&
4974                RSTRING_LEN(RARRAY_PTR(result)[RARRAY_LEN(result)-1]) == 0)
4975             rb_ary_pop(result);
4976     }
4977
4978     return result;
4979 }
4980
4981 VALUE
4982 rb_str_split(VALUE str, const char *sep0)
4983 {
4984     VALUE sep;
4985
4986     StringValue(str);
4987     sep = rb_str_new2(sep0);
4988     return rb_str_split_m(1, &sep, str);
4989 }
4990
4991
4992 /*
4993  *  Document-method: lines
4994  *  call-seq:
4995  *     str.lines(separator=$/)   => anEnumerator
4996  *     str.lines(separator=$/) {|substr| block }        => str
4997  *
4998  *  Returns an enumerator that gives each line in the string.  If a block is
4999  *  given, it iterates over each line in the string.
5000  *
5001  *     "foo\nbar\n".lines.to_a   #=> ["foo\n", "bar\n"]
5002  *     "foo\nb ar".lines.sort    #=> ["b ar", "foo\n"]
5003  */
5004
5005 /*
5006  *  Document-method: each_line
5007  *  call-seq:
5008  *     str.each_line(separator=$/) {|substr| block }   => str
5009  *
5010  *  Splits <i>str</i> using the supplied parameter as the record separator
5011  *  (<code>$/</code> by default), passing each substring in turn to the supplied
5012  *  block. If a zero-length record separator is supplied, the string is split on
5013  *  <code>\n</code> characters, except that multiple successive newlines are
5014  *  appended together.
5015  *
5016  *     print "Example one\n"
5017  *     "hello\nworld".each {|s| p s}
5018  *     print "Example two\n"
5019  *     "hello\nworld".each('l') {|s| p s}
5020  *     print "Example three\n"
5021  *     "hello\n\n\nworld".each('') {|s| p s}
5022  *
5023  *  <em>produces:</em>
5024  *
5025  *     Example one
5026  *     "hello\n"
5027  *     "world"
5028  *     Example two
5029  *     "hel"
5030  *     "l"
5031  *     "o\nworl"
5032  *     "d"
5033  *     Example three
5034  *     "hello\n\n\n"
5035  *     "world"
5036  */
5037
5038 static VALUE
5039 rb_str_each_line(int argc, VALUE *argv, VALUE str)
5040 {
5041     rb_encoding *enc;
5042     VALUE rs;
5043     int newline;
5044     char *p, *pend, *s, *ptr;
5045     long len, rslen;
5046     VALUE line;
5047     int n;
5048
5049     if (argc == 0) {
5050         rs = rb_rs;
5051     }
5052     else {
5053         rb_scan_args(argc, argv, "01", &rs);
5054     }
5055     RETURN_ENUMERATOR(str, argc, argv);
5056     if (NIL_P(rs)) {
5057         rb_yield(str);
5058         return str;
5059     }
5060     str = rb_str_new4(str);
5061     ptr = p = s = RSTRING_PTR(str);
5062     pend = p + RSTRING_LEN(str);
5063     len = RSTRING_LEN(str);
5064     StringValue(rs);
5065     if (rs == rb_default_rs) {
5066         enc = rb_enc_get(str);
5067         while (p < pend) {
5068             char *p0;
5069
5070             p = memchr(p, '\n', pend - p);
5071             if (!p) break;
5072             p0 = rb_enc_left_char_head(s, p, enc);
5073             if (!rb_enc_is_newline(p0, pend, enc)) {
5074                 p++;
5075                 continue;
5076             }
5077             p = p0 + rb_enc_mbclen(p0, pend, enc);
5078             line = rb_str_new5(str, s, p - s);
5079             OBJ_INFECT(line, str);
5080             rb_enc_cr_str_copy_for_substr(line, str);
5081             rb_yield(line);
5082             str_mod_check(str, ptr, len);
5083             s = p;
5084         }
5085         goto finish;
5086     }
5087
5088     enc = rb_enc_check(str, rs);
5089     rslen = RSTRING_LEN(rs);
5090     if (rslen == 0) {
5091         newline = '\n';
5092     }
5093     else {
5094         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
5095     }
5096
5097     while (p < pend) {
5098         int c = rb_enc_codepoint(p, pend, enc);
5099
5100         n = rb_enc_codelen(c, enc);
5101         if (rslen == 0 && c == newline) {
5102             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
5103                 p += n;
5104             }
5105             p -= n;
5106         }
5107         if (c == newline &&
5108             (rslen <= 1 || memcmp(RSTRING_PTR(rs), p, rslen) == 0)) {
5109             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
5110             OBJ_INFECT(line, str);
5111             rb_enc_cr_str_copy_for_substr(line, str);
5112             rb_yield(line);
5113             str_mod_check(str, ptr, len);
5114             s = p + (rslen ? rslen : n);
5115         }
5116         p += n;
5117     }
5118
5119   finish:
5120     if (s != pend) {
5121         line = rb_str_new5(str, s, pend - s);
5122         OBJ_INFECT(line, str);
5123         rb_enc_cr_str_copy_for_substr(line, str);
5124         rb_yield(line);
5125     }
5126
5127     return str;
5128 }
5129
5130
5131 /*
5132  *  Document-method: bytes
5133  *  call-seq:
5134  *     str.bytes   => anEnumerator
5135  *     str.bytes {|fixnum| block }    => str
5136  *
5137  *  Returns an enumerator that gives each byte in the string.  If a block is
5138  *  given, it iterates over each byte in the string.
5139  *
5140  *     "hello".bytes.to_a        #=> [104, 101, 108, 108, 111]
5141  */
5142
5143 /*
5144  *  Document-method: each_byte
5145  *  call-seq:
5146  *     str.each_byte {|fixnum| block }    => str
5147  *
5148  *  Passes each byte in <i>str</i> to the given block.
5149  *
5150  *     "hello".each_byte {|c| print c, ' ' }
5151  *
5152  *  <em>produces:</em>
5153  *
5154  *     104 101 108 108 111
5155  */
5156
5157 static VALUE
5158 rb_str_each_byte(VALUE str)
5159 {
5160     long i;
5161
5162     RETURN_ENUMERATOR(str, 0, 0);
5163     for (i=0; i<RSTRING_LEN(str); i++) {
5164         rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
5165     }
5166     return str;
5167 }
5168
5169
5170 /*
5171  *  Document-method: chars
5172  *  call-seq:
5173  *     str.chars                   => anEnumerator
5174  *     str.chars {|substr| block } => str
5175  *
5176  *  Returns an enumerator that gives each character in the string.
5177  *  If a block is given, it iterates over each character in the string.
5178  *
5179  *     "foo".chars.to_a   #=> ["f","o","o"]
5180  */
5181
5182 /*
5183  *  Document-method: each_char
5184  *  call-seq:
5185  *     str.each_char {|cstr| block }    => str
5186  *
5187  *  Passes each character in <i>str</i> to the given block.
5188  *
5189  *     "hello".each_char {|c| print c, ' ' }
5190  *
5191  *  <em>produces:</em>
5192  *
5193  *     h e l l o
5194  */
5195
5196 static VALUE
5197 rb_str_each_char(VALUE str)
5198 {
5199     int i, len, n;
5200     const char *ptr;
5201     rb_encoding *enc;
5202
5203     RETURN_ENUMERATOR(str, 0, 0);
5204     str = rb_str_new4(str);
5205     ptr = RSTRING_PTR(str);
5206     len = RSTRING_LEN(str);
5207     enc = rb_enc_get(str);
5208     for (i = 0; i < len; i += n) {
5209         n = rb_enc_mbclen(ptr + i, ptr + len, enc);
5210         rb_yield(rb_str_subseq(str, i, n));
5211     }
5212     return str;
5213 }
5214
5215 static long
5216 chopped_length(VALUE str)
5217 {
5218     rb_encoding *enc = STR_ENC_GET(str);
5219     const char *p, *p2, *beg, *end;
5220
5221     beg = RSTRING_PTR(str);
5222     end = beg + RSTRING_LEN(str);
5223     if (beg > end) return 0;
5224     p = rb_enc_prev_char(beg, end, enc);
5225     if (!p) return 0;
5226     if (p > beg && rb_enc_codepoint(p, end, enc) == '\n') {
5227         p2 = rb_enc_prev_char(beg, p, enc);
5228         if (p2 && rb_enc_codepoint(p2, end, enc) == '\r') p = p2;
5229     }
5230     return p - beg;
5231 }
5232
5233 /*
5234  *  call-seq:
5235  *     str.chop!   => str or nil
5236  *
5237  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
5238  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
5239  *  <code>String#chomp!</code>.
5240  */
5241
5242 static VALUE
5243 rb_str_chop_bang(VALUE str)
5244 {
5245     if (RSTRING_LEN(str) > 0) {
5246         long len;
5247         rb_str_modify(str);
5248         len = chopped_length(str);
5249         STR_SET_LEN(str, len);
5250         RSTRING_PTR(str)[len] = '\0';
5251         return str;
5252     }
5253     return Qnil;
5254 }
5255
5256
5257 /*
5258  *  call-seq:
5259  *     str.chop   => new_str
5260  *
5261  *  Returns a new <code>String</code> with the last character removed.  If the
5262  *  string ends with <code>\r\n</code>, both characters are removed. Applying
5263  *  <code>chop</code> to an empty string returns an empty
5264  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
5265  *  the string unchanged if it doesn't end in a record separator.
5266  *
5267  *     "string\r\n".chop   #=> "string"
5268  *     "string\n\r".chop   #=> "string\n"
5269  *     "string\n".chop     #=> "string"
5270  *     "string".chop       #=> "strin"
5271  *     "x".chop.chop       #=> ""
5272  */
5273
5274 static VALUE
5275 rb_str_chop(VALUE str)
5276 {
5277     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
5278     rb_enc_cr_str_copy_for_substr(str2, str);
5279     OBJ_INFECT(str2, str);
5280     return str2;
5281 }
5282
5283
5284 /*
5285  *  call-seq:
5286  *     str.chomp!(separator=$/)   => str or nil
5287  *
5288  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
5289  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
5290  */
5291
5292 static VALUE
5293 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
5294 {
5295     rb_encoding *enc;
5296     VALUE rs;
5297     int newline;
5298     char *p, *pp, *e;
5299     long len, rslen;
5300
5301     len = RSTRING_LEN(str);
5302     if (len == 0) return Qnil;
5303     p = RSTRING_PTR(str);
5304     e = p + len;
5305     if (argc == 0) {
5306         rs = rb_rs;
5307         if (rs == rb_default_rs) {
5308           smart_chomp:
5309             rb_str_modify(str);
5310             enc = rb_enc_get(str);
5311             if (rb_enc_mbminlen(enc) > 1) {
5312                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), enc);
5313                 if (rb_enc_is_newline(pp, e, enc)) {
5314                     e = pp;
5315                 }
5316                 pp = e - rb_enc_mbminlen(enc);
5317                 if (pp >= p) {
5318                     pp = rb_enc_left_char_head(p, pp, enc);
5319                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
5320                         e = pp;
5321                     }
5322                 }
5323                 if (e == RSTRING_END(str)) {
5324                     return Qnil;
5325                 }
5326                 len = e - RSTRING_PTR(str);
5327                 STR_SET_LEN(str, len);
5328             }
5329             else {
5330                 if (RSTRING_PTR(str)[len-1] == '\n') {
5331                     STR_DEC_LEN(str);
5332                     if (RSTRING_LEN(str) > 0 &&
5333                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
5334                         STR_DEC_LEN(str);
5335                     }
5336                 }
5337                 else if (RSTRING_PTR(str)[len-1] == '\r') {
5338                     STR_DEC_LEN(str);
5339                 }
5340                 else {
5341                     return Qnil;
5342                 }
5343             }
5344             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5345             return str;
5346         }
5347     }
5348     else {
5349         rb_scan_args(argc, argv, "01", &rs);
5350     }
5351     if (NIL_P(rs)) return Qnil;
5352     StringValue(rs);
5353     rslen = RSTRING_LEN(rs);
5354     if (rslen == 0) {
5355         while (len>0 && p[len-1] == '\n') {
5356             len--;
5357             if (len>0 && p[len-1] == '\r')
5358                 len--;
5359         }
5360         if (len < RSTRING_LEN(str)) {
5361             rb_str_modify(str);
5362             STR_SET_LEN(str, len);
5363             RSTRING_PTR(str)[len] = '\0';
5364             return str;
5365         }
5366         return Qnil;
5367     }
5368     if (rslen > len) return Qnil;
5369     newline = RSTRING_PTR(rs)[rslen-1];
5370     if (rslen == 1 && newline == '\n')
5371         goto smart_chomp;
5372
5373     enc = rb_enc_check(str, rs);
5374     if (is_broken_string(rs)) {
5375         return Qnil;
5376     }
5377     pp = e - rslen;
5378     if (p[len-1] == newline &&
5379         (rslen <= 1 ||
5380          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
5381         if (rb_enc_left_char_head(p, pp, enc) != pp)
5382             return Qnil;
5383         rb_str_modify(str);
5384         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
5385         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5386         return str;
5387     }
5388     return Qnil;
5389 }
5390
5391
5392 /*
5393  *  call-seq:
5394  *     str.chomp(separator=$/)   => new_str
5395  *
5396  *  Returns a new <code>String</code> with the given record separator removed
5397  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
5398  *  changed from the default Ruby record separator, then <code>chomp</code> also
5399  *  removes carriage return characters (that is it will remove <code>\n</code>,
5400  *  <code>\r</code>, and <code>\r\n</code>).
5401  *
5402  *     "hello".chomp            #=> "hello"
5403  *     "hello\n".chomp          #=> "hello"
5404  *     "hello\r\n".chomp        #=> "hello"
5405  *     "hello\n\r".chomp        #=> "hello\n"
5406  *     "hello\r".chomp          #=> "hello"
5407  *     "hello \n there".chomp   #=> "hello \n there"
5408  *     "hello".chomp("llo")     #=> "he"
5409  */
5410
5411 static VALUE
5412 rb_str_chomp(int argc, VALUE *argv, VALUE str)
5413 {
5414     str = rb_str_dup(str);
5415     rb_str_chomp_bang(argc, argv, str);
5416     return str;
5417 }
5418
5419 /*
5420  *  call-seq:
5421  *     str.lstrip!   => self or nil
5422  *
5423  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
5424  *  change was made. See also <code>String#rstrip!</code> and
5425  *  <code>String#strip!</code>.
5426  *
5427  *     "  hello  ".lstrip   #=> "hello  "
5428  *     "hello".lstrip!      #=> nil
5429  */
5430
5431 static VALUE
5432 rb_str_lstrip_bang(VALUE str)
5433 {
5434     rb_encoding *enc;
5435     char *s, *t, *e;
5436
5437     rb_str_modify(str);
5438     enc = STR_ENC_GET(str);
5439     s = RSTRING_PTR(str);
5440     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5441     e = t = RSTRING_END(str);
5442     /* remove spaces at head */
5443     while (s < e) {
5444         int cc = rb_enc_codepoint(s, e, enc);
5445
5446         if (!rb_enc_isspace(cc, enc)) break;
5447         s += rb_enc_codelen(cc, enc);
5448     }
5449
5450     if (s > RSTRING_PTR(str)) {
5451         rb_str_modify(str);
5452         STR_SET_LEN(str, t-s);
5453         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
5454         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5455         return str;
5456     }
5457     return Qnil;
5458 }
5459
5460
5461 /*
5462  *  call-seq:
5463  *     str.lstrip   => new_str
5464  *
5465  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
5466  *  <code>String#rstrip</code> and <code>String#strip</code>.
5467  *
5468  *     "  hello  ".lstrip   #=> "hello  "
5469  *     "hello".lstrip       #=> "hello"
5470  */
5471
5472 static VALUE
5473 rb_str_lstrip(VALUE str)
5474 {
5475     str = rb_str_dup(str);
5476     rb_str_lstrip_bang(str);
5477     return str;
5478 }
5479
5480
5481 /*
5482  *  call-seq:
5483  *     str.rstrip!   => self or nil
5484  *
5485  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
5486  *  no change was made. See also <code>String#lstrip!</code> and
5487  *  <code>String#strip!</code>.
5488  *
5489  *     "  hello  ".rstrip   #=> "  hello"
5490  *     "hello".rstrip!      #=> nil
5491  */
5492
5493 static VALUE
5494 rb_str_rstrip_bang(VALUE str)
5495 {
5496     rb_encoding *enc;
5497     char *s, *t, *e;
5498     int space_seen = Qfalse;
5499
5500     rb_str_modify(str);
5501     enc = STR_ENC_GET(str);
5502     s = RSTRING_PTR(str);
5503     if (!s || RSTRING_LEN(str) == 0) return Qnil;
5504     t = e = RSTRING_END(str);
5505     while (s < e) {
5506         int cc = rb_enc_codepoint(s, e, enc);
5507
5508         if (!cc || rb_enc_isspace(cc, enc)) {
5509             if (!space_seen) t = s;
5510             space_seen = Qtrue;
5511         }
5512         else {
5513             space_seen = Qfalse;
5514         }
5515         s += rb_enc_codelen(cc, enc);
5516     }
5517     if (!space_seen) t = s;
5518     if (t < e) {
5519         rb_str_modify(str);
5520         STR_SET_LEN(str, t-RSTRING_PTR(str));
5521         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
5522         return str;
5523     }
5524     return Qnil;
5525 }
5526
5527
5528 /*
5529  *  call-seq:
5530  *     str.rstrip   => new_str
5531  *
5532  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
5533  *  <code>String#lstrip</code> and <code>String#strip</code>.
5534  *
5535  *     "  hello  ".rstrip   #=> "  hello"
5536  *     "hello".rstrip       #=> "hello"
5537  */
5538
5539 static VALUE
5540 rb_str_rstrip(VALUE str)
5541 {
5542     str = rb_str_dup(str);
5543     rb_str_rstrip_bang(str);
5544     return str;
5545 }
5546
5547
5548 /*
5549  *  call-seq:
5550  *     str.strip!   => str or nil
5551  *
5552  *  Removes leading and trailing whitespace from <i>str</i>. Returns
5553  *  <code>nil</code> if <i>str</i> was not altered.
5554  */
5555
5556 static VALUE
5557 rb_str_strip_bang(VALUE str)
5558 {
5559     VALUE l = rb_str_lstrip_bang(str);
5560     VALUE r = rb_str_rstrip_bang(str);
5561
5562     if (NIL_P(l) && NIL_P(r)) return Qnil;
5563     return str;
5564 }
5565
5566
5567 /*
5568  *  call-seq:
5569  *     str.strip   => new_str
5570  *
5571  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
5572  *
5573  *     "    hello    ".strip   #=> "hello"
5574  *     "\tgoodbye\r\n".strip   #=> "goodbye"
5575  */
5576
5577 static VALUE
5578 rb_str_strip(VALUE str)
5579 {
5580     str = rb_str_dup(str);
5581     rb_str_strip_bang(str);
5582     return str;
5583 }
5584
5585 static VALUE
5586 scan_once(VALUE str, VALUE pat, long *start)
5587 {
5588     rb_encoding *enc;
5589     VALUE result, match;
5590     struct re_registers *regs;
5591     long i;
5592
5593     enc = STR_ENC_GET(str);
5594     if (rb_reg_search(pat, str, *start, 0) >= 0) {
5595         match = rb_backref_get();
5596         regs = RMATCH_REGS(match);
5597         if (BEG(0) == END(0)) {
5598             /*
5599              * Always consume at least one character of the input string
5600              */
5601             if (RSTRING_LEN(str) > END(0))
5602                 *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
5603                                               RSTRING_END(str), enc);
5604             else
5605                 *start = END(0)+1;
5606         }
5607         else {
5608             *start = END(0);
5609         }
5610         if (regs->num_regs == 1) {
5611             return rb_reg_nth_match(0, match);
5612         }
5613         result = rb_ary_new2(regs->num_regs);
5614         for (i=1; i < regs->num_regs; i++) {
5615             rb_ary_push(result, rb_reg_nth_match(i, match));
5616         }
5617
5618         return result;
5619     }
5620     return Qnil;
5621 }
5622
5623
5624 /*
5625  *  call-seq:
5626  *     str.scan(pattern)                         => array
5627  *     str.scan(pattern) {|match, ...| block }   => str
5628  *
5629  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
5630  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
5631  *  generated and either added to the result array or passed to the block. If
5632  *  the pattern contains no groups, each individual result consists of the
5633  *  matched string, <code>$&</code>.  If the pattern contains groups, each
5634  *  individual result is itself an array containing one entry per group.
5635  *
5636  *     a = "cruel world"
5637  *     a.scan(/\w+/)        #=> ["cruel", "world"]
5638  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
5639  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
5640  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
5641  *
5642  *  And the block form:
5643  *
5644  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
5645  *     print "\n"
5646  *     a.scan(/(.)(.)/) {|x,y| print y, x }
5647  *     print "\n"
5648  *
5649  *  <em>produces:</em>
5650  *
5651  *     <<cruel>> <<world>>
5652  *     rceu lowlr
5653  */
5654
5655 static VALUE
5656 rb_str_scan(VALUE str, VALUE pat)
5657 {
5658     VALUE result;
5659     long start = 0;
5660     VALUE match = Qnil;
5661     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
5662
5663     pat = get_pat(pat, 1);
5664     if (!rb_block_given_p()) {
5665         VALUE ary = rb_ary_new();
5666
5667         while (!NIL_P(result = scan_once(str, pat, &start))) {
5668             match = rb_backref_get();
5669             rb_ary_push(ary, result);
5670         }
5671         rb_backref_set(match);
5672         return ary;
5673     }
5674
5675     while (!NIL_P(result = scan_once(str, pat, &start))) {
5676         match = rb_backref_get();
5677         rb_match_busy(match);
5678         rb_yield(result);
5679         str_mod_check(str, p, len);
5680         rb_backref_set(match);  /* restore $~ value */
5681     }
5682     rb_backref_set(match);
5683     return str;
5684 }
5685
5686
5687 /*
5688  *  call-seq:
5689  *     str.hex   => integer
5690  *
5691  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
5692  *  (with an optional sign and an optional <code>0x</code>) and returns the
5693  *  corresponding number. Zero is returned on error.
5694  *
5695  *     "0x0a".hex     #=> 10
5696  *     "-1234".hex    #=> -4660
5697  *     "0".hex        #=> 0
5698  *     "wombat".hex   #=> 0
5699  */
5700
5701 static VALUE
5702 rb_str_hex(VALUE str)
5703 {
5704     rb_encoding *enc = rb_enc_get(str);
5705
5706     if (!rb_enc_asciicompat(enc)) {
5707         rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5708     }
5709     return rb_str_to_inum(str, 16, Qfalse);
5710 }
5711
5712
5713 /*
5714  *  call-seq:
5715  *     str.oct   => integer
5716  *
5717  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
5718  *  optional sign) and returns the corresponding number.  Returns 0 if the
5719  *  conversion fails.
5720  *
5721  *     "123".oct       #=> 83
5722  *     "-377".oct      #=> -255
5723  *     "bad".oct       #=> 0
5724  *     "0377bad".oct   #=> 255
5725  */
5726
5727 static VALUE
5728 rb_str_oct(VALUE str)
5729 {
5730     rb_encoding *enc = rb_enc_get(str);
5731
5732     if (!rb_enc_asciicompat(enc)) {
5733         rb_raise(rb_eArgError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
5734     }
5735     return rb_str_to_inum(str, -8, Qfalse);
5736 }
5737
5738
5739 /*
5740  *  call-seq:
5741  *     str.crypt(other_str)   => new_str
5742  *
5743  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
5744  *  library function <code>crypt</code>. The argument is the salt string, which
5745  *  should be two characters long, each character drawn from
5746  *  <code>[a-zA-Z0-9./]</code>.
5747  */
5748
5749 static VALUE
5750 rb_str_crypt(VALUE str, VALUE salt)
5751 {
5752     extern char *crypt(const char *, const char *);
5753     VALUE result;
5754     const char *s;
5755
5756     StringValue(salt);
5757     if (RSTRING_LEN(salt) < 2)
5758         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
5759
5760     if (RSTRING_PTR(str)) s = RSTRING_PTR(str);
5761     else s = "";
5762     result = rb_str_new2(crypt(s, RSTRING_PTR(salt)));
5763     OBJ_INFECT(result, str);
5764     OBJ_INFECT(result, salt);
5765     return result;
5766 }
5767
5768
5769 /*
5770  *  call-seq:
5771  *     str.intern   => symbol
5772  *     str.to_sym   => symbol
5773  *
5774  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
5775  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
5776  *
5777  *     "Koala".intern         #=> :Koala
5778  *     s = 'cat'.to_sym       #=> :cat
5779  *     s == :cat              #=> true
5780  *     s = '@cat'.to_sym      #=> :@cat
5781  *     s == :@cat             #=> true
5782  *
5783  *  This can also be used to create symbols that cannot be represented using the
5784  *  <code>:xxx</code> notation.
5785  *
5786  *     'cat and dog'.to_sym   #=> :"cat and dog"
5787  */
5788
5789 VALUE
5790 rb_str_intern(VALUE s)
5791 {
5792     VALUE str = RB_GC_GUARD(s);
5793     ID id;
5794
5795     id = rb_intern_str(str);
5796     return ID2SYM(id);
5797 }
5798
5799
5800 /*
5801  *  call-seq:
5802  *     str.ord   => integer
5803  *
5804  *  Returns the <code>Integer</code> ordinal of a one-character string.
5805  *
5806  *     "a".ord         #=> 97
5807  */
5808
5809 VALUE
5810 rb_str_ord(VALUE s)
5811 {
5812     int c;
5813
5814     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
5815     return INT2NUM(c);
5816 }
5817 /*
5818  *  call-seq:
5819  *     str.sum(n=16)   => integer
5820  *
5821  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
5822  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
5823  *  to 16. The result is simply the sum of the binary value of each character in
5824  *  <i>str</i> modulo <code>2**n</code> (the sum is masked with
5825  *  <code>2**n - 1</code>). This is not a particularly good checksum.
5826  */
5827
5828 static VALUE
5829 rb_str_sum(int argc, VALUE *argv, VALUE str)
5830 {
5831     VALUE vbits;
5832     int bits;
5833     char *ptr, *p, *pend;
5834     long len;
5835
5836     if (argc == 0) {
5837         bits = 16;
5838     }
5839     else {
5840         rb_scan_args(argc, argv, "01", &vbits);
5841         bits = NUM2INT(vbits);
5842     }
5843     ptr = p = RSTRING_PTR(str);
5844     len = RSTRING_LEN(str);
5845     pend = p + len;
5846     if (bits >= sizeof(long)*CHAR_BIT) {
5847         VALUE sum = INT2FIX(0);
5848
5849         while (p < pend) {
5850             str_mod_check(str, ptr, len);
5851             sum = rb_funcall(sum, '+', 1, INT2FIX((unsigned char)*p));
5852             p++;
5853         }
5854         if (bits != 0) {
5855             VALUE mod;
5856
5857             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
5858             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
5859             sum = rb_funcall(sum, '&', 1, mod);
5860         }
5861         return sum;
5862     }
5863     else {
5864        unsigned long sum = 0;
5865
5866         while (p < pend) {
5867             str_mod_check(str, ptr, len);
5868             sum += (unsigned char)*p;
5869             p++;
5870         }
5871         if (bits != 0) {
5872            sum &= (((unsigned long)1)<<bits)-1;
5873         }
5874         return rb_int2inum(sum);
5875     }
5876 }
5877
5878 static VALUE
5879 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
5880 {
5881     rb_encoding *enc;
5882     VALUE w;
5883     long width, len, flen = 1, fclen = 1;
5884     VALUE res;
5885     char *p, *f = " ";
5886     long n, llen, rlen;
5887     volatile VALUE pad;
5888     int singlebyte = 1;
5889
5890     rb_scan_args(argc, argv, "11", &w, &pad);
5891     enc = STR_ENC_GET(str);
5892     width = NUM2LONG(w);
5893     if (argc == 2) {
5894         StringValue(pad);
5895         enc = rb_enc_check(str, pad);
5896         f = RSTRING_PTR(pad);
5897         flen = RSTRING_LEN(pad);
5898         fclen = str_strlen(pad, enc);
5899         singlebyte = single_byte_optimizable(pad);
5900         if (flen == 0 || fclen == 0) {
5901             rb_raise(rb_eArgError, "zero width padding");
5902         }
5903     }
5904     len = str_strlen(str, enc);
5905     if (width < 0 || len >= width) return rb_str_dup(str);
5906     n = width - len;
5907     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
5908     rlen = n - llen;
5909     res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2);
5910     p = RSTRING_PTR(res);
5911     while (llen) {
5912         if (flen <= 1) {
5913             *p++ = *f;
5914             llen--;
5915         }
5916         else if (llen > fclen) {
5917             memcpy(p,f,flen);
5918             p += flen;
5919             llen -= fclen;
5920         }
5921         else {
5922             char *fp = str_nth(f, f+flen, llen, enc, singlebyte);
5923             n = fp - f;
5924             memcpy(p,f,n);
5925             p+=n;
5926             break;
5927         }
5928     }
5929     memcpy(p, RSTRING_PTR(str), RSTRING_LEN(str));
5930     p+=RSTRING_LEN(str);
5931     while (rlen) {
5932         if (flen <= 1) {
5933             *p++ = *f;
5934             rlen--;
5935         }
5936         else if (rlen > fclen) {
5937             memcpy(p,f,flen);
5938             p += flen;
5939             rlen -= fclen;
5940         }
5941         else {
5942             char *fp = str_nth(f, f+flen, rlen, enc, singlebyte);
5943             n = fp - f;
5944             memcpy(p,f,n);
5945             p+=n;
5946             break;
5947         }
5948     }
5949     *p = '\0';
5950     STR_SET_LEN(res, p-RSTRING_PTR(res));
5951     OBJ_INFECT(res, str);
5952     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
5953     rb_enc_associate(res, enc);
5954     return res;
5955 }
5956
5957
5958 /*
5959  *  call-seq:
5960  *     str.ljust(integer, padstr=' ')   => new_str
5961  *
5962  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
5963  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
5964  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
5965  *
5966  *     "hello".ljust(4)            #=> "hello"
5967  *     "hello".ljust(20)           #=> "hello               "
5968  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
5969  */
5970
5971 static VALUE
5972 rb_str_ljust(int argc, VALUE *argv, VALUE str)
5973 {
5974     return rb_str_justify(argc, argv, str, 'l');
5975 }
5976
5977
5978 /*
5979  *  call-seq:
5980  *     str.rjust(integer, padstr=' ')   => new_str
5981  *
5982  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
5983  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
5984  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
5985  *
5986  *     "hello".rjust(4)            #=> "hello"
5987  *     "hello".rjust(20)           #=> "               hello"
5988  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
5989  */
5990
5991 static VALUE
5992 rb_str_rjust(int argc, VALUE *argv, VALUE str)
5993 {
5994     return rb_str_justify(argc, argv, str, 'r');
5995 }
5996
5997
5998 /*
5999  *  call-seq:
6000  *     str.center(integer, padstr=' ')   => new_str
6001  *
6002  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
6003  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
6004  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
6005  *
6006  *     "hello".center(4)         #=> "hello"
6007  *     "hello".center(20)        #=> "       hello        "
6008  *     "hello".center(20, '123') #=> "1231231hello12312312"
6009  */
6010
6011 static VALUE
6012 rb_str_center(int argc, VALUE *argv, VALUE str)
6013 {
6014     return rb_str_justify(argc, argv, str, 'c');
6015 }
6016
6017 /*
6018  *  call-seq:
6019  *     str.partition(sep)              => [head, sep, tail]
6020  *
6021  *  Searches the string for <i>sep</i> and returns the part before
6022  *  it, the <i>sep</i>, and the part after it.  If <i>sep</i> is not found,
6023  *  returns <i>str</i> and two empty strings.
6024  *
6025  *     "hello".partition("l")         #=> ["he", "l", "lo"]
6026  *     "hello".partition("x")         #=> ["hello", "", ""]
6027  */
6028
6029 static VALUE
6030 rb_str_partition(VALUE str, VALUE sep)
6031 {
6032     long pos;
6033     int regex = Qfalse;
6034
6035     if (TYPE(sep) == T_REGEXP) {
6036         pos = rb_reg_search(sep, str, 0, 0);
6037         regex = Qtrue;
6038     }
6039     else {
6040         VALUE tmp;
6041
6042         tmp = rb_check_string_type(sep);
6043         if (NIL_P(tmp)) {
6044             rb_raise(rb_eTypeError, "type mismatch: %s given",
6045                      rb_obj_classname(sep));
6046         }
6047         pos = rb_str_index(str, sep, 0);
6048     }
6049     if (pos < 0) {
6050       failed:
6051         return rb_ary_new3(3, str, rb_str_new(0,0),rb_str_new(0,0));
6052     }
6053     if (regex) {
6054         sep = rb_str_subpat(str, sep, 0);
6055         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
6056     }
6057     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
6058                           sep,
6059                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
6060                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
6061 }
6062
6063 /*
6064  *  call-seq:
6065  *     str.rpartition(sep)            => [head, sep, tail]
6066  *
6067  *  Searches <i>sep</i> in the string from the end of the string, and
6068  *  returns the part before it, the <i>sep</i>, and the part after it.
6069  *  If <i>sep</i> is not found, returns two empty strings and
6070  *  <i>str</i>.
6071  *
6072  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
6073  *     "hello".rpartition("x")         #=> ["", "", "hello"]
6074  */
6075
6076 static VALUE
6077 rb_str_rpartition(VALUE str, VALUE sep)
6078 {
6079     long pos = RSTRING_LEN(str);
6080     int regex = Qfalse;
6081
6082     if (TYPE(sep) == T_REGEXP) {
6083         pos = rb_reg_search(sep, str, pos, 1);
6084         regex = Qtrue;
6085     }
6086     else {
6087         VALUE tmp;
6088
6089         tmp = rb_check_string_type(sep);
6090         if (NIL_P(tmp)) {
6091             rb_raise(rb_eTypeError, "type mismatch: %s given",
6092                      rb_obj_classname(sep));
6093         }
6094         pos = rb_str_sublen(str, pos);
6095         pos = rb_str_rindex(str, sep, pos);
6096     }
6097     if (pos < 0) {
6098         return rb_ary_new3(3, rb_str_new(0,0),rb_str_new(0,0), str);
6099     }
6100     if (regex) {
6101         sep = rb_reg_nth_match(0, rb_backref_get());
6102     }
6103     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
6104                           sep,
6105                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
6106 }
6107
6108 /*
6109  *  call-seq:
6110  *     str.start_with?([prefix]+)   => true or false
6111  *
6112  *  Returns true if <i>str</i> starts with the prefix given.
6113  */
6114
6115 static VALUE
6116 rb_str_start_with(int argc, VALUE *argv, VALUE str)
6117 {
6118     int i;
6119
6120     for (i=0; i<argc; i++) {
6121         VALUE tmp = rb_check_string_type(argv[i]);
6122         if (NIL_P(tmp)) continue;
6123         rb_enc_check(str, tmp);
6124         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6125         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6126             return Qtrue;
6127     }
6128     return Qfalse;
6129 }
6130
6131 /*
6132  *  call-seq:
6133  *     str.end_with?([suffix]+)   => true or false
6134  *
6135  *  Returns true if <i>str</i> ends with the suffix given.
6136  */
6137
6138 static VALUE
6139 rb_str_end_with(int argc, VALUE *argv, VALUE str)
6140 {
6141     int i;
6142     char *p, *s;
6143     rb_encoding *enc;
6144
6145     for (i=0; i<argc; i++) {
6146         VALUE tmp = rb_check_string_type(argv[i]);
6147         if (NIL_P(tmp)) continue;
6148         enc = rb_enc_check(str, tmp);
6149         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
6150         p = RSTRING_PTR(str);
6151         s = p + RSTRING_LEN(str) - RSTRING_LEN(tmp);
6152         if (rb_enc_left_char_head(p, s, enc) != s)
6153             continue;
6154         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
6155             return Qtrue;
6156     }
6157     return Qfalse;
6158 }
6159
6160 void
6161 rb_str_setter(VALUE val, ID id, VALUE *var)
6162 {
6163     if (!NIL_P(val) && TYPE(val) != T_STRING) {
6164         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
6165     }
6166     *var = val;
6167 }
6168
6169
6170 /*
6171  *  call-seq:
6172  *     str.force_encoding(encoding)   => str
6173  *
6174  *  Changes the encoding to +encoding+ and returns self.
6175  */
6176
6177 static VALUE
6178 rb_str_force_encoding(VALUE str, VALUE enc)
6179 {
6180     str_modifiable(str);
6181     rb_enc_associate(str, rb_to_encoding(enc));
6182     return str;
6183 }
6184
6185 /*
6186  *  call-seq:
6187  *     str.valid_encoding?  => true or false
6188  *
6189  *  Returns true for a string which encoded correctly.
6190  *
6191  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding? => true
6192  *    "\xc2".force_encoding("UTF-8").valid_encoding? => false
6193  *    "\x80".force_encoding("UTF-8").valid_encoding? => false
6194  */
6195
6196 static VALUE
6197 rb_str_valid_encoding_p(VALUE str)
6198 {
6199     int cr = rb_enc_str_coderange(str);
6200
6201     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
6202 }
6203
6204 /*
6205  *  call-seq:
6206  *     str.ascii_only?  => true or false
6207  *
6208  *  Returns true for a string which has only ASCII characters.
6209  *
6210  *    "abc".force_encoding("UTF-8").ascii_only? => true
6211  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only? => false
6212  */
6213
6214 static VALUE
6215 rb_str_is_ascii_only_p(VALUE str)
6216 {
6217     int cr = rb_enc_str_coderange(str);
6218
6219     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
6220 }
6221
6222 /**********************************************************************
6223  * Document-class: Symbol
6224  *
6225  *  <code>Symbol</code> objects represent names and some strings
6226  *  inside the Ruby
6227  *  interpreter. They are generated using the <code>:name</code> and
6228  *  <code>:"string"</code> literals
6229  *  syntax, and by the various <code>to_sym</code> methods. The same
6230  *  <code>Symbol</code> object will be created for a given name or string
6231  *  for the duration of a program's execution, regardless of the context
6232  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
6233  *  one context, a method in another, and a class in a third, the
6234  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
6235  *  all three contexts.
6236  *
6237  *     module One
6238  *       class Fred
6239  *       end
6240  *       $f1 = :Fred
6241  *     end
6242  *     module Two
6243  *       Fred = 1
6244  *       $f2 = :Fred
6245  *     end
6246  *     def Fred()
6247  *     end
6248  *     $f3 = :Fred
6249  *     $f1.object_id   #=> 2514190
6250  *     $f2.object_id   #=> 2514190
6251  *     $f3.object_id   #=> 2514190
6252  *
6253  */
6254
6255
6256 /*
6257  *  call-seq:
6258  *     sym == obj   => true or false
6259  *
6260  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
6261  *  symbol, returns <code>true</code>. Otherwise, compares them
6262  *  as strings.
6263  */
6264
6265 static VALUE
6266 sym_equal(VALUE sym1, VALUE sym2)
6267 {
6268     if (sym1 == sym2) return Qtrue;
6269     return Qfalse;
6270 }
6271
6272
6273 /*
6274  *  call-seq:
6275  *     sym.to_i      => fixnum
6276  *
6277  *  Returns an integer that is unique for each symbol within a
6278  *  particular execution of a program.
6279  *
6280  *     :fred.to_i           #=> 9809
6281  *     "fred".to_sym.to_i   #=> 9809
6282  */
6283
6284 static VALUE
6285 sym_to_i(VALUE sym)
6286 {
6287     ID id = SYM2ID(sym);
6288
6289     return LONG2FIX(id);
6290 }
6291
6292
6293 /*
6294  *  call-seq:
6295  *     sym.inspect    => string
6296  *
6297  *  Returns the representation of <i>sym</i> as a symbol literal.
6298  *
6299  *     :fred.inspect   #=> ":fred"
6300  */
6301
6302 static VALUE
6303 sym_inspect(VALUE sym)
6304 {
6305     VALUE str, klass = Qundef;
6306     ID id = SYM2ID(sym);
6307     rb_encoding *enc;
6308
6309     sym = rb_id2str(id);
6310     enc = STR_ENC_GET(sym);
6311     str = rb_enc_str_new(0, RSTRING_LEN(sym)+1, enc);
6312     RSTRING_PTR(str)[0] = ':';
6313     memcpy(RSTRING_PTR(str)+1, RSTRING_PTR(sym), RSTRING_LEN(sym));
6314     if (RSTRING_LEN(sym) != strlen(RSTRING_PTR(sym)) ||
6315         !rb_enc_symname_p(RSTRING_PTR(sym), enc)) {
6316         str = rb_str_inspect(str);
6317         strncpy(RSTRING_PTR(str), ":\"", 2);
6318     }
6319     if (klass != Qundef) {
6320         rb_str_cat2(str, "/");
6321         rb_str_append(str, rb_inspect(klass));
6322     }
6323     return str;
6324 }
6325
6326
6327 /*
6328  *  call-seq:
6329  *     sym.id2name   => string
6330  *     sym.to_s      => string
6331  *
6332  *  Returns the name or string corresponding to <i>sym</i>.
6333  *
6334  *     :fred.id2name   #=> "fred"
6335  */
6336
6337
6338 VALUE
6339 rb_sym_to_s(VALUE sym)
6340 {
6341     ID id = SYM2ID(sym);
6342
6343     return str_new3(rb_cString, rb_id2str(id));
6344 }
6345
6346
/*
 * call-seq:
 *   sym.to_sym   => sym
 *   sym.intern   => sym
 *
 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
 * in this case.
 */

static VALUE
sym_to_sym(VALUE sym)
{
    /* identity: the receiver is handed back unchanged */
    return sym;
}
6362
6363 static VALUE
6364 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
6365 {
6366     VALUE obj;
6367
6368     if (argc < 1) {
6369         rb_raise(rb_eArgError, "no receiver given");
6370     }
6371     obj = argv[0];
6372     return rb_funcall3(obj, (ID)sym, argc - 1, argv + 1);
6373 }
6374
6375 /*
6376  * call-seq:
6377  *   sym.to_proc
6378  *
6379  * Returns a _Proc_ object which respond to the given method by _sym_.
6380  *
6381  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
6382  */
6383
6384 static VALUE
6385 sym_to_proc(VALUE sym)
6386 {
6387     return rb_proc_new(sym_call, (VALUE)SYM2ID(sym));
6388 }
6389
6390
6391 static VALUE
6392 sym_succ(VALUE sym)
6393 {
6394     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
6395 }
6396
6397 static VALUE
6398 sym_cmp(VALUE sym, VALUE other)
6399 {
6400     if (!SYMBOL_P(other)) {
6401         return Qnil;
6402     }
6403     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
6404 }
6405
6406 static VALUE
6407 sym_casecmp(VALUE sym, VALUE other)
6408 {
6409     if (!SYMBOL_P(other)) {
6410         return Qnil;
6411     }
6412     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
6413 }
6414
6415 static VALUE
6416 sym_match(VALUE sym, VALUE other)
6417 {
6418     return rb_str_match(rb_sym_to_s(sym), other);
6419 }
6420
6421 static VALUE
6422 sym_eqq(VALUE sym, VALUE other)
6423 {
6424     if (sym == other) return Qtrue;
6425     return rb_str_equal(rb_sym_to_s(sym), other);
6426 }
6427
6428 static VALUE
6429 sym_aref(int argc, VALUE *argv, VALUE sym)
6430 {
6431     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
6432 }
6433
6434 static VALUE
6435 sym_length(VALUE sym)
6436 {
6437     return rb_str_length(rb_id2str(SYM2ID(sym)));
6438 }
6439
6440 static VALUE
6441 sym_empty(VALUE sym)
6442 {
6443     return rb_str_empty(rb_id2str(SYM2ID(sym)));
6444 }
6445
6446 static VALUE
6447 sym_upcase(VALUE sym)
6448 {
6449     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
6450 }
6451
6452 static VALUE
6453 sym_downcase(VALUE sym)
6454 {
6455     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
6456 }
6457
6458 static VALUE
6459 sym_capitalize(VALUE sym)
6460 {
6461     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
6462 }
6463
6464 static VALUE
6465 sym_swapcase(VALUE sym)
6466 {
6467     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
6468 }
6469
6470 static VALUE
6471 sym_encoding(VALUE sym)
6472 {
6473     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
6474 }
6475
6476 ID
6477 rb_to_id(VALUE name)
6478 {
6479     VALUE tmp;
6480     ID id;
6481
6482     switch (TYPE(name)) {
6483       default:
6484         tmp = rb_check_string_type(name);
6485         if (NIL_P(tmp)) {
6486             rb_raise(rb_eTypeError, "%s is not a symbol",
6487                      RSTRING_PTR(rb_inspect(name)));
6488         }
6489         name = tmp;
6490         /* fall through */
6491       case T_STRING:
6492         name = rb_str_intern(name);
6493         /* fall through */
6494       case T_SYMBOL:
6495         return SYM2ID(name);
6496     }
6497     return id;
6498 }
6499
6500 /*
6501  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
6502  *  bytes, typically representing characters. String objects may be created
6503  *  using <code>String::new</code> or as literals.
6504  *
6505  *  Because of aliasing issues, users of strings should be aware of the methods
6506  *  that modify the contents of a <code>String</code> object.  Typically,
6507  *  methods with names ending in ``!'' modify their receiver, while those
6508  *  without a ``!'' return a new <code>String</code>.  However, there are
6509  *  exceptions, such as <code>String#[]=</code>.
6510  *
6511  */
6512
void
Init_String(void)
{
    /*
     * Defines the String and Symbol classes and binds their Ruby-level
     * method names to the C implementations in this file.
     *
     * The last argument of each rb_define_method call is the arity.
     * In this file handlers registered with -1 have the signature
     * (int argc, VALUE *argv, VALUE self); handlers registered with a
     * non-negative count take self followed by that many VALUE arguments.
     */

    /* ---- String: class setup, comparison and basic operators ---- */
    rb_cString  = rb_define_class("String", rb_cObject);
    rb_include_module(rb_cString, rb_mComparable);
    rb_define_alloc_func(rb_cString, str_alloc);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    /* initialize_copy shares its implementation with "replace" below */
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "==", rb_str_equal, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "+", rb_str_plus, 1);
    rb_define_method(rb_cString, "*", rb_str_times, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    /* "length" and "size" are the same function */
    rb_define_method(rb_cString, "length", rb_str_length, 0);
    rb_define_method(rb_cString, "size", rb_str_length, 0);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    /* "succ"/"next" and "succ!"/"next!" are aliases of each other */
    rb_define_method(rb_cString, "succ", rb_str_succ, 0);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next", rb_str_succ, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "replace", rb_str_replace, 1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);

    /* ---- String: conversions ("to_s" and "to_str" share rb_str_to_s) ---- */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);

    /* ---- String: case conversion ---- */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);

    /* ---- String: case conversion, "!" variants ---- */
    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);

    /* ---- String: parsing, iteration and miscellaneous ---- */
    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    /* "lines", "bytes" and "chars" reuse the each_line/each_byte/each_char
     * functions registered further below */
    rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
    rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    /* "concat" and "<<" are the same function */
    rb_define_method(rb_cString, "concat", rb_str_concat, 1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    /* "intern" and "to_sym" are the same function */
    rb_define_method(rb_cString, "intern", rb_str_intern, 0);
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    /* ---- String: containment predicates ---- */
    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    /* ---- String: justification ---- */
    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    /* ---- String: substitution and trimming ---- */
    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);

    /* ---- String: substitution and trimming, "!" variants ---- */
    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);

    /* ---- String: character-set operations ---- */
    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    /* ---- String: character-set operations, "!" variants ---- */
    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    /* ---- String: iterators ---- */
    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* "slice" shares its implementation with "[]" */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    /* ---- String: encoding ---- */
    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    /* look up the ID for "to_s" once and keep it in id_to_s */
    id_to_s = rb_intern("to_s");

    /* rb_fs starts out nil and is exposed under two global names */
    rb_fs = Qnil;
    rb_define_variable("$;", &rb_fs);
    rb_define_variable("$-F", &rb_fs);

    /* ---- Symbol: class setup (no allocator, no Symbol.new) ---- */
    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    rb_include_module(rb_cSymbol, rb_mComparable);
    rb_undef_alloc_func(rb_cSymbol);
    rb_undef_method(CLASS_OF(rb_cSymbol), "new");
    rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */

    /* ---- Symbol: identity and conversions ---- */
    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "to_i", sym_to_i, 0);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    /* ---- Symbol: comparison and matching ---- */
    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);
    rb_define_method(rb_cSymbol, "===", sym_eqq, 1);

    /* ---- Symbol: string-like accessors ("match" reuses sym_match) ---- */
    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    rb_define_method(rb_cSymbol, "match", sym_match, 1);

    /* ---- Symbol: case conversion ---- */
    rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}