string.c

   1 /**********************************************************************
   2
   3   string.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 17:12:58 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
  10   Copyright (C) 2000  Information-technology Promotion Agency, Japan
  11
  12 **********************************************************************/
  13
  14 #include "ruby/internal/config.h"
  15
  16 #include <ctype.h>
  17 #include <errno.h>
  18 #include <math.h>
  19
  20 #ifdef HAVE_UNISTD_H
  21 # include <unistd.h>
  22 #endif
  23
  24 #include "debug_counter.h"
  25 #include "encindex.h"
  26 #include "gc.h"
  27 #include "id.h"
  28 #include "internal.h"
  29 #include "internal/array.h"
  30 #include "internal/compar.h"
  31 #include "internal/compilers.h"
  32 #include "internal/encoding.h"
  33 #include "internal/error.h"
  34 #include "internal/gc.h"
  35 #include "internal/numeric.h"
  36 #include "internal/object.h"
  37 #include "internal/proc.h"
  38 #include "internal/re.h"
  39 #include "internal/sanitizers.h"
  40 #include "internal/string.h"
  41 #include "internal/transcode.h"
  42 #include "probes.h"
  43 #include "ruby/encoding.h"
  44 #include "ruby/re.h"
  45 #include "ruby/util.h"
  46 #include "ruby_assert.h"
  47 #include "vm_sync.h"
  48
  49 #if defined HAVE_CRYPT_R
  50 # if defined HAVE_CRYPT_H
  51 #  include <crypt.h>
  52 # endif
  53 #elif !defined HAVE_CRYPT
  54 # include "missing/crypt.h"
  55 # define HAVE_CRYPT_R 1
  56 #endif
  57
  /* Shorthands for entry `no` of the beg[] / end[] arrays of a variable named
   * `regs`, which must be in scope wherever these macros are expanded
   * (presumably regexp match registers -- confirm against the users). */
  58 #define BEG(no) (regs->beg[(no)])
  59 #define END(no) (regs->end[(no)])
  60
  61 #undef rb_str_new
  62 #undef rb_usascii_str_new
  63 #undef rb_utf8_str_new
  64 #undef rb_enc_str_new
  65 #undef rb_str_new_cstr
  66 #undef rb_usascii_str_new_cstr
  67 #undef rb_utf8_str_new_cstr
  68 #undef rb_enc_str_new_cstr
  69 #undef rb_external_str_new_cstr
  70 #undef rb_locale_str_new_cstr
  71 #undef rb_str_dup_frozen
  72 #undef rb_str_buf_new_cstr
  73 #undef rb_str_buf_cat
  74 #undef rb_str_buf_cat2
  75 #undef rb_str_cat2
  76 #undef rb_str_cat_cstr
  77 #undef rb_fstring_cstr
  78
  /* Global class handles.  rb_cString is passed as the class of newly created
   * strings throughout this file; neither variable is assigned in this part
   * of the file. */
  79 VALUE rb_cString;
  80 VALUE rb_cSymbol;
  81
  82 /* FLAGS of RString
  83  *
  84  * 1:     RSTRING_NOEMBED
  85  * 2:     STR_SHARED (== ELTS_SHARED)
  86  * 2-6:   RSTRING_EMBED_LEN (5 bits == 32)
  87  * 5:     STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
  88  *                         other strings that rely on this string's buffer)
  89  * 6:     STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
  90  *                      early, specific to rb_str_tmp_frozen_{acquire,release})
  91  * 7:     STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
  92  *                     such as read(2). Any modification and realloc is prohibited)
  93  *
  94  * 8-9:   ENC_CODERANGE (2 bits)
  95  * 10-16: ENCODING (7 bits == 128)
  96  * 17:    RSTRING_FSTR
  97  * 18:    STR_NOFREE (do not free this string's buffer when a String is freed.
  98  *                    used for a string object based on C string literal)
  99  * 19:    STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
 100  *                     object header is temporarily allocated on C stack)
 101  */
 102
 /* File-local names for the FL_USER flag bits listed in the "FLAGS of RString"
  * table above.  RUBY_MAX_CHAR_LEN is not used in this part of the file. */
 103 #define RUBY_MAX_CHAR_LEN 16
 104 #define STR_SHARED_ROOT FL_USER5
 105 #define STR_BORROWED FL_USER6
 106 #define STR_TMPLOCK FL_USER7
 107 #define STR_NOFREE FL_USER18
 108 #define STR_FAKESTR FL_USER19
 109
 /* Set STR_NOEMBED on `str`.  With USE_RVARGC the STR_SHARED, STR_SHARED_ROOT
  * and STR_BORROWED flags are cleared as well; otherwise the embedded-length
  * bits are reset to 0 instead. */
 110 #define STR_SET_NOEMBED(str) do {\
 111     FL_SET((str), STR_NOEMBED);\
 112     if (USE_RVARGC) {\
 113         FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
 114     }\
 115     else {\
 116         STR_SET_EMBED_LEN((str), 0);\
 117     }\
 118 } while (0)
 /* Clear both STR_NOEMBED and STR_NOFREE on `str`. */
 119 #define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
 /* Store `n` as the length of an embedded string.  Where the length lives
  * depends on the build configuration. */
 120 #if USE_RVARGC
 /* Length is kept in as.embed.len; asserts that `n` is strictly smaller than
  * the embed capacity of the slot. */
 121 # define STR_SET_EMBED_LEN(str, n) do { \
 122     assert(str_embed_capa(str) > (n));\
 123     RSTRING(str)->as.embed.len = (n);\
 124 } while (0)
 125 #else
 /* Length is kept in the RSTRING_EMBED_LEN bits of the flags word: clear the
  * old bits, then OR in the new value. */
 126 # define STR_SET_EMBED_LEN(str, n) do { \
 127     long tmp_n = (n);\
 128     RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
 129     RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
 130 } while (0)
 131 #endif
 132
 /* Set the length of `str` to `n`, using STR_SET_EMBED_LEN for embedded
  * strings and as.heap.len otherwise.  `str` is evaluated more than once. */
 133 #define STR_SET_LEN(str, n) do { \
 134     if (STR_EMBED_P(str)) {\
 135         STR_SET_EMBED_LEN((str), (n));\
 136     }\
 137     else {\
 138         RSTRING(str)->as.heap.len = (n);\
 139     }\
 140 } while (0)
 141
 /* Decrease the length of `str` by one.  No check against a zero length is
  * made here. */
 142 #define STR_DEC_LEN(str) do {\
 143     if (STR_EMBED_P(str)) {\
 144         long n = RSTRING_LEN(str);\
 145         n--;\
 146         STR_SET_EMBED_LEN((str), n);\
 147     }\
 148     else {\
 149         RSTRING(str)->as.heap.len--;\
 150     }\
 151 } while (0)
 152
 /* Terminator length of `str`: the minimum character length (mbminlen) of
  * its encoding. */
 153 #define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
 /* Write `termlen` zero bytes at `ptr`.  The first byte is always stored
  * directly; memset() is only called for terminators longer than one byte,
  * which is marked as the unlikely case. */
 154 #define TERM_FILL(ptr, termlen) do {\
 155     char *const term_fill_ptr = (ptr);\
 156     const int term_fill_len = (termlen);\
 157     *term_fill_ptr = '\0';\
 158     if (UNLIKELY(term_fill_len > 1))\
 159         memset(term_fill_ptr, 0, term_fill_len);\
 160 } while (0)
 161
 162 #define RESIZE_CAPA(str,capacity) do {\
 163     const int termlen = TERM_LEN(str);\
 164     RESIZE_CAPA_TERM(str,capacity,termlen);\
 165 } while (0)
 166 #define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
 167     if (STR_EMBED_P(str)) {\
 168         if (str_embed_capa(str) < capacity + termlen) {\
 169             char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
 170             const long tlen = RSTRING_LEN(str);\
 171             memcpy(tmp, RSTRING_PTR(str), tlen);\
 172             RSTRING(str)->as.heap.ptr = tmp;\
 173             RSTRING(str)->as.heap.len = tlen;\
 174             STR_SET_NOEMBED(str);\
 175             RSTRING(str)->as.heap.aux.capa = (capacity);\
 176         }\
 177     }\
 178     else {\
 179         assert(!FL_TEST((str), STR_SHARED)); \
 180         SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
 181                         (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
 182         RSTRING(str)->as.heap.aux.capa = (capacity);\
 183     }\
 184 } while (0)
 185
 /* Make `str` share the buffer of `shared_str`.  Does nothing when `str` is a
  * fake string (STR_FAKESTR).  Otherwise it asserts that str's pointer lies
  * within [ptr, ptr + len] of shared_str, records shared_str in
  * as.heap.aux.shared through the write barrier (RB_OBJ_WRITE), sets
  * STR_SHARED on str and STR_SHARED_ROOT on shared_str, and additionally sets
  * STR_BORROWED on shared_str when shared_str has no class (class == 0). */
 186 #define STR_SET_SHARED(str, shared_str) do { \
 187     if (!FL_TEST(str, STR_FAKESTR)) { \
 188         assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
 189         assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
 190         RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
 191         FL_SET((str), STR_SHARED); \
 192         FL_SET((shared_str), STR_SHARED_ROOT); \
 193         if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
 194             FL_SET_RAW((shared_str), STR_BORROWED); \
 195     } \
 196 } while (0)
 197
 /* Heap buffer pointer of `str` (only meaningful for non-embedded strings). */
 198 #define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
 /* Heap buffer size in bytes: aux.capa plus the terminator length, because
  * capa itself does not count the terminator. */
 199 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
 200 /* TODO: include the terminator size in capa. */
 201
 /* Encoding of `str` as resolved by get_encoding(). */
 202 #define STR_ENC_GET(str) get_encoding(str)
 203
 /* SHARABLE_MIDDLE_SUBSTRING defaults to 0 unless defined by the build. */
 204 #if !defined SHARABLE_MIDDLE_SUBSTRING
 205 # define SHARABLE_MIDDLE_SUBSTRING 0
 206 #endif
 /* SHARABLE_SUBSTRING_P(beg, len, end): with the default setting it is true
  * only when beg + len == end, i.e. the range reaches `end`; when
  * SHARABLE_MIDDLE_SUBSTRING is enabled it is always true. */
 207 #if !SHARABLE_MIDDLE_SUBSTRING
 208 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
 209 #else
 210 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
 211 #endif
 212
 213
 214 static inline long
 215 str_embed_capa(VALUE str)
 216 {
 217 #if USE_RVARGC
 218     return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
 219 #else
 220     return RSTRING_EMBED_LEN_MAX + 1;
 221 #endif
 222 }
 223
 224 static inline size_t
 225 str_embed_size(long capa)
 226 {
 227     return offsetof(struct RString, as.embed.ary) + capa;
 228 }
 229
 230 static inline bool
 231 STR_EMBEDDABLE_P(long len, long termlen)
 232 {
 233 #if USE_RVARGC
 234     return rb_gc_size_allocatable_p(str_embed_size(len + termlen));
 235 #else
 236     return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
 237 #endif
 238 }
 239
 /* Prototypes for file-local (static) helpers so that they can be referenced
  * before their definitions. */
 240 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
 241 static VALUE str_new_frozen(VALUE klass, VALUE orig);
 242 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
 243 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
 244 static VALUE str_new(VALUE klass, const char *ptr, long len);
 245 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
 246 static inline void str_modifiable(VALUE str);
 247 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
 248
 249 static inline void
 250 str_make_independent(VALUE str)
 251 {
 252     long len = RSTRING_LEN(str);
 253     int termlen = TERM_LEN(str);
 254     str_make_independent_expand((str), len, 0L, termlen);
 255 }
 256
 257 static inline int str_dependent_p(VALUE str);
 258
 259 void
 260 rb_str_make_independent(VALUE str)
 261 {
 262     if (str_dependent_p(str)) {
 263         str_make_independent(str);
 264     }
 265 }
 266
 267 void
 268 rb_debug_rstring_null_ptr(const char *func)
 269 {
 270     fprintf(stderr, "%s is returning NULL!! "
 271             "SIGSEGV is highly expected to follow immediately. "
 272             "If you could reproduce, attach your debugger here, "
 273             "and look at the passed string.",
 274             func);
 275 }
 276
 277 /* Symbols for the [up|down|swap]case/capitalize options; they are not
      * initialized in this part of the file. */
 278 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
 279
 280 static rb_encoding *
 281 get_actual_encoding(const int encidx, VALUE str)
 282 {
 283     const unsigned char *q;
 284
 285     switch (encidx) {
 286       case ENCINDEX_UTF_16:
 287         if (RSTRING_LEN(str) < 2) break;
 288         q = (const unsigned char *)RSTRING_PTR(str);
 289         if (q[0] == 0xFE && q[1] == 0xFF) {
 290             return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
 291         }
 292         if (q[0] == 0xFF && q[1] == 0xFE) {
 293             return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
 294         }
 295         return rb_ascii8bit_encoding();
 296       case ENCINDEX_UTF_32:
 297         if (RSTRING_LEN(str) < 4) break;
 298         q = (const unsigned char *)RSTRING_PTR(str);
 299         if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
 300             return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
 301         }
 302         if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
 303             return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
 304         }
 305         return rb_ascii8bit_encoding();
 306     }
 307     return rb_enc_from_index(encidx);
 308 }
 309
 310 static rb_encoding *
 311 get_encoding(VALUE str)
 312 {
 313     return get_actual_encoding(ENCODING_GET(str), str);
 314 }
 315
 316 static void
 317 mustnot_broken(VALUE str)
 318 {
 319     if (is_broken_string(str)) {
 320         rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
 321     }
 322 }
 323
 324 static void
 325 mustnot_wchar(VALUE str)
 326 {
 327     rb_encoding *enc = STR_ENC_GET(str);
 328     if (rb_enc_mbminlen(enc) > 1) {
 329         rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
 330     }
 331 }
 332
 /* Prototypes needed before the definitions further down. */
 333 static int fstring_cmp(VALUE a, VALUE b);
 334
 335 static VALUE register_fstring(VALUE str, bool copy);
 336
 /* st hash type built from fstring_cmp and rb_str_hash, in that order
  * (presumably the type of the table returned by rb_vm_fstring_table() --
  * confirm where the table is created). */
 337 const struct st_hash_type rb_fstring_hash_type = {
 338     fstring_cmp,
 339     rb_str_hash,
 340 };
 341
 /* True when `str` has no FL_EXIVAR flag and its class is exactly rb_cString. */
 342 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
 343
 /* Argument block handed to fstr_update_callback() through st_update(). */
 344 struct fstr_update_arg {
 345     VALUE fstr; /* result: the table's string, or Qundef when a garbage entry was deleted and the caller must retry */
 346     bool copy;  /* for fake-string keys: true copies the bytes (str_new), false references them (str_new_static) */
 347 };
 348
 349 static int
 350 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
 351 {
 352
 353     struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
 354     VALUE str = (VALUE)*key;
 355
 356     if (existing) {
 357         /* because of lazy sweep, str may be unmarked already and swept
 358          * at next time */
 359
 360         if (rb_objspace_garbage_object_p(str)) {
 361             arg->fstr = Qundef;
 362             return ST_DELETE;
 363         }
 364
 365         arg->fstr = str;
 366         return ST_STOP;
 367     }
 368     else {
 369         if (FL_TEST_RAW(str, STR_FAKESTR)) {
 370             if (arg->copy) {
 371                 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
 372                 rb_enc_copy(new_str, str);
 373                 str = new_str;
 374             }
 375             else {
 376                 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
 377                                      RSTRING(str)->as.heap.len,
 378                                      ENCODING_GET(str));
 379             }
 380             OBJ_FREEZE_RAW(str);
 381         }
 382         else {
 383             if (!OBJ_FROZEN(str))
 384                 str = str_new_frozen(rb_cString, str);
 385             if (STR_SHARED_P(str)) { /* str should not be shared */
 386                 /* shared substring  */
 387                 str_make_independent(str);
 388                 assert(OBJ_FROZEN(str));
 389             }
 390             if (!BARE_STRING_P(str)) {
 391                 str = str_new_frozen(rb_cString, str);
 392             }
 393         }
 394         RBASIC(str)->flags |= RSTRING_FSTR;
 395
 396         *key = *value = arg->fstr = str;
 397         return ST_CONTINUE;
 398     }
 399 }
 400
 401 RUBY_FUNC_EXPORTED
 402 VALUE
 403 rb_fstring(VALUE str)
 404 {
 405     VALUE fstr;
 406     int bare;
 407
 408     Check_Type(str, T_STRING);
 409
 410     if (FL_TEST(str, RSTRING_FSTR))
 411         return str;
 412
 413     bare = BARE_STRING_P(str);
 414     if (!bare) {
 415         if (STR_EMBED_P(str)) {
 416             OBJ_FREEZE_RAW(str);
 417             return str;
 418         }
 419         if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
 420             assert(OBJ_FROZEN(str));
 421             return str;
 422         }
 423     }
 424
 425     if (!OBJ_FROZEN(str))
 426         rb_str_resize(str, RSTRING_LEN(str));
 427
 428     fstr = register_fstring(str, FALSE);
 429
 430     if (!bare) {
 431         str_replace_shared_without_enc(str, fstr);
 432         OBJ_FREEZE_RAW(str);
 433         return str;
 434     }
 435     return fstr;
 436 }
 437
 438 static VALUE
 439 register_fstring(VALUE str, bool copy)
 440 {
 441     struct fstr_update_arg args;
 442     args.copy = copy;
 443
 444     RB_VM_LOCK_ENTER();
 445     {
 446         st_table *frozen_strings = rb_vm_fstring_table();
 447         do {
 448             args.fstr = str;
 449             st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
 450         } while (args.fstr == Qundef);
 451     }
 452     RB_VM_LOCK_LEAVE();
 453
 454     assert(OBJ_FROZEN(args.fstr));
 455     assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
 456     assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
 457     assert(RBASIC_CLASS(args.fstr) == rb_cString);
 458     return args.fstr;
 459 }
 460
 461 static VALUE
 462 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
 463 {
 464     fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
 465     /* SHARED to be allocated by the callback */
 466
 467     if (!name) {
 468         RUBY_ASSERT_ALWAYS(len == 0);
 469         name = "";
 470     }
 471
 472     ENCODING_SET_INLINED((VALUE)fake_str, encidx);
 473
 474     RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
 475     fake_str->as.heap.len = len;
 476     fake_str->as.heap.ptr = (char *)name;
 477     fake_str->as.heap.aux.capa = len;
 478     return (VALUE)fake_str;
 479 }
 480
 481 /*
 482  * set up a fake string which refers a static string literal.
 483  */
 484 VALUE
 485 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
 486 {
 487     return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
 488 }
 489
 490 /*
 491  * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
 492  * shared string which refers a static string literal.  `ptr` must
 493  * point a constant string.
 494  */
 495 MJIT_FUNC_EXPORTED VALUE
 496 rb_fstring_new(const char *ptr, long len)
 497 {
 498     struct RString fake_str;
 499     return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
 500 }
 501
 502 VALUE
 503 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
 504 {
 505     struct RString fake_str;
 506     return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
 507 }
 508
 509 VALUE
 510 rb_fstring_cstr(const char *ptr)
 511 {
 512     return rb_fstring_new(ptr, strlen(ptr));
 513 }
 514
 515 static int
 516 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
 517 {
 518     RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
 519     return ST_CONTINUE;
 520 }
 521
 522 static int
 523 fstring_cmp(VALUE a, VALUE b)
 524 {
 525     long alen, blen;
 526     const char *aptr, *bptr;
 527     RSTRING_GETMEM(a, aptr, alen);
 528     RSTRING_GETMEM(b, bptr, blen);
 529     return (alen != blen ||
 530             ENCODING_GET(a) != ENCODING_GET(b) ||
 531             memcmp(aptr, bptr, alen) != 0);
 532 }
 533
 534 static inline int
 535 single_byte_optimizable(VALUE str)
 536 {
 537     rb_encoding *enc;
 538
 539     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
 540     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
 541         return 1;
 542
 543     enc = STR_ENC_GET(str);
 544     if (rb_enc_mbmaxlen(enc) == 1)
 545         return 1;
 546
 547     /* Conservative.  Possibly single byte.
 548      * "\xa1" in Shift_JIS for example. */
 549     return 0;
 550 }
 551
 /* Global VALUE that is neither set nor read in this part of the file
  * (presumably the default field separator -- confirm). */
 552 VALUE rb_fs;
 553
 /*
  * Return a pointer to the first byte in [p, e) whose high bit (0x80) is set,
  * or NULL when there is none.
  *
  * The bulk of the range is scanned one uintptr_t word at a time by AND-ing
  * each word with NONASCII_MASK (0x80 repeated in every byte).  Bytes before
  * the first aligned word (only when unaligned word access is not allowed)
  * and the trailing bytes that do not fill a word are tested one by one.
  */
 554 static inline const char *
 555 search_nonascii(const char *p, const char *e)
 556 {
 557     const uintptr_t *s, *t;
 558
 /* NONASCII_MASK: 0x80 in every byte of a uintptr_t (4- or 8-byte only). */
 559 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
 560 # if SIZEOF_UINTPTR_T == 8
 561 #  define NONASCII_MASK UINT64_C(0x8080808080808080)
 562 # elif SIZEOF_UINTPTR_T == 4
 563 #  define NONASCII_MASK UINT32_C(0x80808080)
 564 # else
 565 #  error "don't know what to do."
 566 # endif
 567 #else
 568 # if SIZEOF_UINTPTR_T == 8
 569 #  define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
 570 # elif SIZEOF_UINTPTR_T == 4
 571 #  define NONASCII_MASK 0x80808080UL /* or...? */
 572 # else
 573 #  error "don't know what to do."
 574 # endif
 575 #endif
 576
 /* Word scan: always attempted when unaligned access is allowed, otherwise
  * only when at least one full word's worth of bytes is available. */
 577     if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
 578 #if !UNALIGNED_WORD_ACCESS
 /* Advance p to the next word boundary; the switch falls through on purpose
  * so that each of the `l` skipped bytes is tested in address order. */
 579         if ((uintptr_t)p % SIZEOF_VOIDP) {
 580             int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
 581             p += l;
 582             switch (l) {
 583               default: UNREACHABLE;
 584 #if SIZEOF_VOIDP > 4
 585               case 7: if (p[-7]&0x80) return p-7;
 586               case 6: if (p[-6]&0x80) return p-6;
 587               case 5: if (p[-5]&0x80) return p-5;
 588               case 4: if (p[-4]&0x80) return p-4;
 589 #endif
 590               case 3: if (p[-3]&0x80) return p-3;
 591               case 2: if (p[-2]&0x80) return p-2;
 592               case 1: if (p[-1]&0x80) return p-1;
 593               case 0: break;
 594             }
 595         }
 596 #endif
 597 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
 598 #define aligned_ptr(value) \
 599         __builtin_assume_aligned((value), sizeof(uintptr_t))
 600 #else
 601 #define aligned_ptr(value) (uintptr_t *)(value)
 602 #endif
 /* s: first word to test; t: one-past limit chosen so that every word read
  * in the loop lies completely inside [p, e). */
 603         s = aligned_ptr(p);
 604         t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
 605 #undef aligned_ptr
 606         for (;s < t; s++) {
 607             if (*s & NONASCII_MASK) {
 /* Locate the lowest-addressed flagged byte inside the word: count leading
  * zero bits on big-endian targets, trailing zero bits otherwise, and
  * convert the bit count to a byte offset (>>3). */
 608 #ifdef WORDS_BIGENDIAN
 609                 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
 610 #else
 611                 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
 612 #endif
 613             }
 614         }
 615         p = (const char *)s;
 616     }
 617
 /* Tail: fewer than SIZEOF_VOIDP bytes remain; deliberate fall-through tests
  * them in address order, relative to `e`. */
 618     switch (e - p) {
 619       default: UNREACHABLE;
 620 #if SIZEOF_VOIDP > 4
 621       case 7: if (e[-7]&0x80) return e-7;
 622       case 6: if (e[-6]&0x80) return e-6;
 623       case 5: if (e[-5]&0x80) return e-5;
 624       case 4: if (e[-4]&0x80) return e-4;
 625 #endif
 626       case 3: if (e[-3]&0x80) return e-3;
 627       case 2: if (e[-2]&0x80) return e-2;
 628       case 1: if (e[-1]&0x80) return e-1;
 629       case 0: return NULL;
 630     }
 631 }
 632
 633 static int
 634 coderange_scan(const char *p, long len, rb_encoding *enc)
 635 {
 636     const char *e = p + len;
 637
 638     if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
 639         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 640         p = search_nonascii(p, e);
 641         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 642     }
 643
 644     if (rb_enc_asciicompat(enc)) {
 645         p = search_nonascii(p, e);
 646         if (!p) return ENC_CODERANGE_7BIT;
 647         for (;;) {
 648             int ret = rb_enc_precise_mbclen(p, e, enc);
 649             if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
 650             p += MBCLEN_CHARFOUND_LEN(ret);
 651             if (p == e) break;
 652             p = search_nonascii(p, e);
 653             if (!p) break;
 654         }
 655     }
 656     else {
 657         while (p < e) {
 658             int ret = rb_enc_precise_mbclen(p, e, enc);
 659             if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
 660             p += MBCLEN_CHARFOUND_LEN(ret);
 661         }
 662     }
 663     return ENC_CODERANGE_VALID;
 664 }
 665
 666 long
 667 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
 668 {
 669     const char *p = s;
 670
 671     if (*cr == ENC_CODERANGE_BROKEN)
 672         return e - s;
 673
 674     if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
 675         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 676         if (*cr == ENC_CODERANGE_VALID) return e - s;
 677         p = search_nonascii(p, e);
 678         *cr = p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 679         return e - s;
 680     }
 681     else if (rb_enc_asciicompat(enc)) {
 682         p = search_nonascii(p, e);
 683         if (!p) {
 684             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
 685             return e - s;
 686         }
 687         for (;;) {
 688             int ret = rb_enc_precise_mbclen(p, e, enc);
 689             if (!MBCLEN_CHARFOUND_P(ret)) {
 690                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 691                 return p - s;
 692             }
 693             p += MBCLEN_CHARFOUND_LEN(ret);
 694             if (p == e) break;
 695             p = search_nonascii(p, e);
 696             if (!p) break;
 697         }
 698     }
 699     else {
 700         while (p < e) {
 701             int ret = rb_enc_precise_mbclen(p, e, enc);
 702             if (!MBCLEN_CHARFOUND_P(ret)) {
 703                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 704                 return p - s;
 705             }
 706             p += MBCLEN_CHARFOUND_LEN(ret);
 707         }
 708     }
 709     *cr = ENC_CODERANGE_VALID;
 710     return e - s;
 711 }
 712
 713 static inline void
 714 str_enc_copy(VALUE str1, VALUE str2)
 715 {
 716     rb_enc_set_index(str1, ENCODING_GET(str2));
 717 }
 718
 719 static void
 720 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
 721 {
 722     /* this function is designed for copying encoding and coderange
 723      * from src to new string "dest" which is made from the part of src.
 724      */
 725     str_enc_copy(dest, src);
 726     if (RSTRING_LEN(dest) == 0) {
 727         if (!rb_enc_asciicompat(STR_ENC_GET(src)))
 728             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 729         else
 730             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 731         return;
 732     }
 733     switch (ENC_CODERANGE(src)) {
 734       case ENC_CODERANGE_7BIT:
 735         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 736         break;
 737       case ENC_CODERANGE_VALID:
 738         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
 739             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
 740             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 741         else
 742             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 743         break;
 744       default:
 745         break;
 746     }
 747 }
 748
 749 static void
 750 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
 751 {
 752     str_enc_copy(dest, src);
 753     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
 754 }
 755
 756 static int
 757 enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx)
 758 {
 759     if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
 760         rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
 761         return ENC_CODERANGE_BROKEN;
 762     }
 763     else {
 764         return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
 765     }
 766 }
 767
 768 int
 769 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
 770 {
 771     return enc_coderange_scan(str, enc, rb_enc_to_index(enc));
 772 }
 773
 774 int
 775 rb_enc_str_coderange(VALUE str)
 776 {
 777     int cr = ENC_CODERANGE(str);
 778
 779     if (cr == ENC_CODERANGE_UNKNOWN) {
 780         int encidx = ENCODING_GET(str);
 781         rb_encoding *enc = rb_enc_from_index(encidx);
 782         cr = enc_coderange_scan(str, enc, encidx);
 783         ENC_CODERANGE_SET(str, cr);
 784     }
 785     return cr;
 786 }
 787
 788 int
 789 rb_enc_str_asciionly_p(VALUE str)
 790 {
 791     rb_encoding *enc = STR_ENC_GET(str);
 792
 793     if (!rb_enc_asciicompat(enc))
 794         return FALSE;
 795     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
 796         return TRUE;
 797     return FALSE;
 798 }
 799
 800 static inline void
 801 str_mod_check(VALUE s, const char *p, long len)
 802 {
 803     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
 804         rb_raise(rb_eRuntimeError, "string modified");
 805     }
 806 }
 807
 808 static size_t
 809 str_capacity(VALUE str, const int termlen)
 810 {
 811     if (STR_EMBED_P(str)) {
 812 #if USE_RVARGC
 813         return str_embed_capa(str) - termlen;
 814 #else
 815         return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
 816 #endif
 817     }
 818     else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
 819         return RSTRING(str)->as.heap.len;
 820     }
 821     else {
 822         return RSTRING(str)->as.heap.aux.capa;
 823     }
 824 }
 825
 826 size_t
 827 rb_str_capacity(VALUE str)
 828 {
 829     return str_capacity(str, TERM_LEN(str));
 830 }
 831
 832 static inline void
 833 must_not_null(const char *ptr)
 834 {
 835     if (!ptr) {
 836         rb_raise(rb_eArgError, "NULL pointer given");
 837     }
 838 }
 839
 840 static inline VALUE
 841 str_alloc(VALUE klass, size_t size)
 842 {
 843     assert(size > 0);
 844     RVARGC_NEWOBJ_OF(str, struct RString, klass,
 845                      T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
 846     return (VALUE)str;
 847 }
 848
 849 static inline VALUE
 850 str_alloc_embed(VALUE klass, size_t capa)
 851 {
 852     size_t size = str_embed_size(capa);
 853     assert(rb_gc_size_allocatable_p(size));
 854 #if !USE_RVARGC
 855     assert(size <= sizeof(struct RString));
 856 #endif
 857     return str_alloc(klass, size);
 858 }
 859
 860 static inline VALUE
 861 str_alloc_heap(VALUE klass)
 862 {
 863     return str_alloc(klass, sizeof(struct RString));
 864 }
 865
 866 static inline VALUE
 867 empty_str_alloc(VALUE klass)
 868 {
 869     RUBY_DTRACE_CREATE_HOOK(STRING, 0);
 870     VALUE str = str_alloc_embed(klass, 0);
 871     memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
 872     return str;
 873 }
 874
 875 static VALUE
 876 str_new0(VALUE klass, const char *ptr, long len, int termlen)
 877 {
 878     VALUE str;
 879
 880     if (len < 0) {
 881         rb_raise(rb_eArgError, "negative string size (or size too big)");
 882     }
 883
 884     RUBY_DTRACE_CREATE_HOOK(STRING, len);
 885
 886     if (STR_EMBEDDABLE_P(len, termlen)) {
 887         str = str_alloc_embed(klass, len + termlen);
 888         if (len == 0) {
 889             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
 890         }
 891     }
 892     else {
 893         str = str_alloc_heap(klass);
 894         RSTRING(str)->as.heap.aux.capa = len;
 895         /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
 896          * integer overflow.  If we can STATIC_ASSERT that, the following
 897          * mul_add_mul can be reverted to a simple ALLOC_N. */
 898         RSTRING(str)->as.heap.ptr =
 899             rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
 900         STR_SET_NOEMBED(str);
 901     }
 902     if (ptr) {
 903         memcpy(RSTRING_PTR(str), ptr, len);
 904     }
 905     STR_SET_LEN(str, len);
 906     TERM_FILL(RSTRING_PTR(str) + len, termlen);
 907     return str;
 908 }
 909
 910 static VALUE
 911 str_new(VALUE klass, const char *ptr, long len)
 912 {
 913     return str_new0(klass, ptr, len, 1);
 914 }
 915
 916 VALUE
 917 rb_str_new(const char *ptr, long len)
 918 {
 919     return str_new(rb_cString, ptr, len);
 920 }
 921
 922 VALUE
 923 rb_usascii_str_new(const char *ptr, long len)
 924 {
 925     VALUE str = rb_str_new(ptr, len);
 926     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 927     return str;
 928 }
 929
 930 VALUE
 931 rb_utf8_str_new(const char *ptr, long len)
 932 {
 933     VALUE str = str_new(rb_cString, ptr, len);
 934     rb_enc_associate_index(str, rb_utf8_encindex());
 935     return str;
 936 }
 937
 938 VALUE
 939 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
 940 {
 941     VALUE str;
 942
 943     if (!enc) return rb_str_new(ptr, len);
 944
 945     str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
 946     rb_enc_associate(str, enc);
 947     return str;
 948 }
 949
 950 VALUE
 951 rb_str_new_cstr(const char *ptr)
 952 {
 953     must_not_null(ptr);
 954     /* rb_str_new_cstr() can take pointer from non-malloc-generated
 955      * memory regions, and that cannot be detected by the MSAN.  Just
 956      * trust the programmer that the argument passed here is a sane C
 957      * string. */
 958     __msan_unpoison_string(ptr);
 959     return rb_str_new(ptr, strlen(ptr));
 960 }
 961
 962 VALUE
 963 rb_usascii_str_new_cstr(const char *ptr)
 964 {
 965     VALUE str = rb_str_new_cstr(ptr);
 966     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 967     return str;
 968 }
 969
 970 VALUE
 971 rb_utf8_str_new_cstr(const char *ptr)
 972 {
 973     VALUE str = rb_str_new_cstr(ptr);
 974     rb_enc_associate_index(str, rb_utf8_encindex());
 975     return str;
 976 }
 977
 978 VALUE
 979 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
 980 {
 981     must_not_null(ptr);
 982     if (rb_enc_mbminlen(enc) != 1) {
 983         rb_raise(rb_eArgError, "wchar encoding given");
 984     }
 985     return rb_enc_str_new(ptr, strlen(ptr), enc);
 986 }
 987
 988 static VALUE
 989 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
 990 {
 991     VALUE str;
 992
 993     if (len < 0) {
 994         rb_raise(rb_eArgError, "negative string size (or size too big)");
 995     }
 996
 997     if (!ptr) {
 998         rb_encoding *enc = rb_enc_get_from_index(encindex);
 999         str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1000     }
1001     else {
1002         RUBY_DTRACE_CREATE_HOOK(STRING, len);
1003         str = str_alloc_heap(klass);
1004         RSTRING(str)->as.heap.len = len;
1005         RSTRING(str)->as.heap.ptr = (char *)ptr;
1006         RSTRING(str)->as.heap.aux.capa = len;
1007         STR_SET_NOEMBED(str);
1008         RBASIC(str)->flags |= STR_NOFREE;
1009     }
1010     rb_enc_associate_index(str, encindex);
1011     return str;
1012 }
1013
1014 VALUE
1015 rb_str_new_static(const char *ptr, long len)
1016 {
1017     return str_new_static(rb_cString, ptr, len, 0);
1018 }
1019
1020 VALUE
1021 rb_usascii_str_new_static(const char *ptr, long len)
1022 {
1023     return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1024 }
1025
1026 VALUE
1027 rb_utf8_str_new_static(const char *ptr, long len)
1028 {
1029     return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1030 }
1031
1032 VALUE
1033 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1034 {
1035     return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1036 }
1037
1038 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1039                                    rb_encoding *from, rb_encoding *to,
1040                                    int ecflags, VALUE ecopts);
1041
1042 static inline bool
1043 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1044 {
1045     int encidx = rb_enc_to_index(enc);
1046     if (rb_enc_get_index(str) == encidx)
1047         return is_ascii_string(str);
1048     return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT;
1049 }
1050
1051 VALUE
1052 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1053 {
1054     long len;
1055     const char *ptr;
1056     VALUE newstr;
1057
1058     if (!to) return str;
1059     if (!from) from = rb_enc_get(str);
1060     if (from == to) return str;
1061     if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1062         to == rb_ascii8bit_encoding()) {
1063         if (STR_ENC_GET(str) != to) {
1064             str = rb_str_dup(str);
1065             rb_enc_associate(str, to);
1066         }
1067         return str;
1068     }
1069
1070     RSTRING_GETMEM(str, ptr, len);
1071     newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1072                                    from, to, ecflags, ecopts);
1073     if (NIL_P(newstr)) {
1074         /* some error, return original */
1075         return str;
1076     }
1077     return newstr;
1078 }
1079
1080 VALUE
1081 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1082                          rb_encoding *from, int ecflags, VALUE ecopts)
1083 {
1084     long olen;
1085
1086     olen = RSTRING_LEN(newstr);
1087     if (ofs < -olen || olen < ofs)
1088         rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1089     if (ofs < 0) ofs += olen;
1090     if (!from) {
1091         STR_SET_LEN(newstr, ofs);
1092         return rb_str_cat(newstr, ptr, len);
1093     }
1094
1095     rb_str_modify(newstr);
1096     return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1097                                  rb_enc_get(newstr),
1098                                  ecflags, ecopts);
1099 }
1100
1101 VALUE
1102 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1103 {
1104     STR_SET_LEN(str, 0);
1105     rb_enc_associate(str, enc);
1106     rb_str_cat(str, ptr, len);
1107     return str;
1108 }
1109
1110 static VALUE
1111 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1112                       rb_encoding *from, rb_encoding *to,
1113                       int ecflags, VALUE ecopts)
1114 {
1115     rb_econv_t *ec;
1116     rb_econv_result_t ret;
1117     long olen;
1118     VALUE econv_wrapper;
1119     const unsigned char *start, *sp;
1120     unsigned char *dest, *dp;
1121     size_t converted_output = (size_t)ofs;
1122
1123     olen = rb_str_capacity(newstr);
1124
1125     econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1126     RBASIC_CLEAR_CLASS(econv_wrapper);
1127     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1128     if (!ec) return Qnil;
1129     DATA_PTR(econv_wrapper) = ec;
1130
1131     sp = (unsigned char*)ptr;
1132     start = sp;
1133     while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1134            (dp = dest + converted_output),
1135            (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1136            ret == econv_destination_buffer_full) {
1137         /* destination buffer short */
1138         size_t converted_input = sp - start;
1139         size_t rest = len - converted_input;
1140         converted_output = dp - dest;
1141         rb_str_set_len(newstr, converted_output);
1142         if (converted_input && converted_output &&
1143             rest < (LONG_MAX / converted_output)) {
1144             rest = (rest * converted_output) / converted_input;
1145         }
1146         else {
1147             rest = olen;
1148         }
1149         olen += rest < 2 ? 2 : rest;
1150         rb_str_resize(newstr, olen);
1151     }
1152     DATA_PTR(econv_wrapper) = 0;
1153     rb_econv_close(ec);
1154     switch (ret) {
1155       case econv_finished:
1156         len = dp - (unsigned char*)RSTRING_PTR(newstr);
1157         rb_str_set_len(newstr, len);
1158         rb_enc_associate(newstr, to);
1159         return newstr;
1160
1161       default:
1162         return Qnil;
1163     }
1164 }
1165
1166 VALUE
1167 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1168 {
1169     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1170 }
1171
1172 VALUE
1173 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1174 {
1175     rb_encoding *ienc;
1176     VALUE str;
1177     const int eidx = rb_enc_to_index(eenc);
1178
1179     if (!ptr) {
1180         return rb_enc_str_new(ptr, len, eenc);
1181     }
1182
1183     /* ASCII-8BIT case, no conversion */
1184     if ((eidx == rb_ascii8bit_encindex()) ||
1185         (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1186         return rb_str_new(ptr, len);
1187     }
1188     /* no default_internal or same encoding, no conversion */
1189     ienc = rb_default_internal_encoding();
1190     if (!ienc || eenc == ienc) {
1191         return rb_enc_str_new(ptr, len, eenc);
1192     }
1193     /* ASCII compatible, and ASCII only string, no conversion in
1194      * default_internal */
1195     if ((eidx == rb_ascii8bit_encindex()) ||
1196         (eidx == rb_usascii_encindex()) ||
1197         (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1198         return rb_enc_str_new(ptr, len, ienc);
1199     }
1200     /* convert from the given encoding to default_internal */
1201     str = rb_enc_str_new(NULL, 0, ienc);
1202     /* when the conversion failed for some reason, just ignore the
1203      * default_internal and result in the given encoding as-is. */
1204     if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1205         rb_str_initialize(str, ptr, len, eenc);
1206     }
1207     return str;
1208 }
1209
1210 VALUE
1211 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1212 {
1213     int eidx = rb_enc_to_index(eenc);
1214     if (eidx == rb_usascii_encindex() &&
1215         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1216         rb_enc_associate_index(str, rb_ascii8bit_encindex());
1217         return str;
1218     }
1219     rb_enc_associate_index(str, eidx);
1220     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1221 }
1222
1223 VALUE
1224 rb_external_str_new(const char *ptr, long len)
1225 {
1226     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1227 }
1228
1229 VALUE
1230 rb_external_str_new_cstr(const char *ptr)
1231 {
1232     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1233 }
1234
1235 VALUE
1236 rb_locale_str_new(const char *ptr, long len)
1237 {
1238     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1239 }
1240
1241 VALUE
1242 rb_locale_str_new_cstr(const char *ptr)
1243 {
1244     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1245 }
1246
1247 VALUE
1248 rb_filesystem_str_new(const char *ptr, long len)
1249 {
1250     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1251 }
1252
1253 VALUE
1254 rb_filesystem_str_new_cstr(const char *ptr)
1255 {
1256     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1257 }
1258
1259 VALUE
1260 rb_str_export(VALUE str)
1261 {
1262     return rb_str_export_to_enc(str, rb_default_external_encoding());
1263 }
1264
1265 VALUE
1266 rb_str_export_locale(VALUE str)
1267 {
1268     return rb_str_export_to_enc(str, rb_locale_encoding());
1269 }
1270
1271 VALUE
1272 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1273 {
1274     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1275 }
1276
1277 static VALUE
1278 str_replace_shared_without_enc(VALUE str2, VALUE str)
1279 {
1280     const int termlen = TERM_LEN(str);
1281     char *ptr;
1282     long len;
1283
1284     RSTRING_GETMEM(str, ptr, len);
1285     if (str_embed_capa(str2) >= len + termlen) {
1286         char *ptr2 = RSTRING(str2)->as.embed.ary;
1287         STR_SET_EMBED(str2);
1288         memcpy(ptr2, RSTRING_PTR(str), len);
1289         STR_SET_EMBED_LEN(str2, len);
1290         TERM_FILL(ptr2+len, termlen);
1291     }
1292     else {
1293         VALUE root;
1294         if (STR_SHARED_P(str)) {
1295             root = RSTRING(str)->as.heap.aux.shared;
1296             RSTRING_GETMEM(str, ptr, len);
1297         }
1298         else {
1299             root = rb_str_new_frozen(str);
1300             RSTRING_GETMEM(root, ptr, len);
1301         }
1302         assert(OBJ_FROZEN(root));
1303         if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1304             if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1305                 rb_fatal("about to free a possible shared root");
1306             }
1307             char *ptr2 = STR_HEAP_PTR(str2);
1308             if (ptr2 != ptr) {
1309                 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1310             }
1311         }
1312         FL_SET(str2, STR_NOEMBED);
1313         RSTRING(str2)->as.heap.len = len;
1314         RSTRING(str2)->as.heap.ptr = ptr;
1315         STR_SET_SHARED(str2, root);
1316     }
1317     return str2;
1318 }
1319
1320 static VALUE
1321 str_replace_shared(VALUE str2, VALUE str)
1322 {
1323     str_replace_shared_without_enc(str2, str);
1324     rb_enc_cr_str_exact_copy(str2, str);
1325     return str2;
1326 }
1327
1328 static VALUE
1329 str_new_shared(VALUE klass, VALUE str)
1330 {
1331     return str_replace_shared(str_alloc_heap(klass), str);
1332 }
1333
1334 VALUE
1335 rb_str_new_shared(VALUE str)
1336 {
1337     return str_new_shared(rb_obj_class(str), str);
1338 }
1339
1340 VALUE
1341 rb_str_new_frozen(VALUE orig)
1342 {
1343     if (OBJ_FROZEN(orig)) return orig;
1344     return str_new_frozen(rb_obj_class(orig), orig);
1345 }
1346
1347 static VALUE
1348 rb_str_new_frozen_String(VALUE orig)
1349 {
1350     if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1351     return str_new_frozen(rb_cString, orig);
1352 }
1353
1354 VALUE
1355 rb_str_tmp_frozen_acquire(VALUE orig)
1356 {
1357     if (OBJ_FROZEN_RAW(orig)) return orig;
1358     return str_new_frozen_buffer(0, orig, FALSE);
1359 }
1360
1361 void
1362 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1363 {
1364     if (RBASIC_CLASS(tmp) != 0)
1365         return;
1366
1367     if (STR_EMBED_P(tmp)) {
1368         assert(OBJ_FROZEN_RAW(tmp));
1369     }
1370     else if (FL_TEST_RAW(orig, STR_SHARED) &&
1371             !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1372         VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1373
1374         if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1375             assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1376             assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1377
1378             /* Unshare orig since the root (tmp) only has this one child. */
1379             FL_UNSET_RAW(orig, STR_SHARED);
1380             RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1381             RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1382             assert(OBJ_FROZEN_RAW(tmp));
1383
1384             /* Make tmp embedded and empty so it is safe for sweeping. */
1385             STR_SET_EMBED(tmp);
1386             STR_SET_EMBED_LEN(tmp, 0);
1387         }
1388     }
1389 }
1390
1391 static VALUE
1392 str_new_frozen(VALUE klass, VALUE orig)
1393 {
1394     return str_new_frozen_buffer(klass, orig, TRUE);
1395 }
1396
1397 static VALUE
1398 heap_str_make_shared(VALUE klass, VALUE orig)
1399 {
1400     assert(!STR_EMBED_P(orig));
1401     assert(!STR_SHARED_P(orig));
1402
1403     VALUE str = str_alloc_heap(klass);
1404     STR_SET_NOEMBED(str);
1405     RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1406     RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1407     RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1408     RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1409     RBASIC(orig)->flags &= ~STR_NOFREE;
1410     STR_SET_SHARED(orig, str);
1411     if (klass == 0)
1412         FL_UNSET_RAW(str, STR_BORROWED);
1413     return str;
1414 }
1415
1416 static VALUE
1417 str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1418 {
1419     VALUE str;
1420
1421     long len = RSTRING_LEN(orig);
1422     int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1423
1424     if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1425         str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
1426         assert(STR_EMBED_P(str));
1427     }
1428     else {
1429         if (FL_TEST_RAW(orig, STR_SHARED)) {
1430             VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1431             long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1432             long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
1433             assert(ofs >= 0);
1434             assert(rest >= 0);
1435             assert(ofs + rest <= RSTRING_LEN(shared));
1436 #if !USE_RVARGC
1437             assert(!STR_EMBED_P(shared));
1438 #endif
1439             assert(OBJ_FROZEN(shared));
1440
1441             if ((ofs > 0) || (rest > 0) ||
1442                 (klass != RBASIC(shared)->klass) ||
1443                 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1444                 str = str_new_shared(klass, shared);
1445                 assert(!STR_EMBED_P(str));
1446                 RSTRING(str)->as.heap.ptr += ofs;
1447                 RSTRING(str)->as.heap.len -= ofs + rest;
1448             }
1449             else {
1450                 if (RBASIC_CLASS(shared) == 0)
1451                     FL_SET_RAW(shared, STR_BORROWED);
1452                 return shared;
1453             }
1454         }
1455         else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1456             str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1457             STR_SET_EMBED(str);
1458             memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1459             STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1460             TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1461         }
1462         else {
1463             str = heap_str_make_shared(klass, orig);
1464         }
1465     }
1466
1467     if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1468     OBJ_FREEZE(str);
1469     return str;
1470 }
1471
1472 VALUE
1473 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1474 {
1475     return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1476 }
1477
1478 static VALUE
1479 str_new_empty_String(VALUE str)
1480 {
1481     VALUE v = rb_str_new(0, 0);
1482     rb_enc_copy(v, str);
1483     return v;
1484 }
1485
1486 #define STR_BUF_MIN_SIZE 63
1487 #if !USE_RVARGC
1488 STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1489 #endif
1490
1491 VALUE
1492 rb_str_buf_new(long capa)
1493 {
1494     if (STR_EMBEDDABLE_P(capa, 1)) {
1495         return str_alloc_embed(rb_cString, capa + 1);
1496     }
1497
1498     VALUE str = str_alloc_heap(rb_cString);
1499
1500 #if !USE_RVARGC
1501     if (capa < STR_BUF_MIN_SIZE) {
1502         capa = STR_BUF_MIN_SIZE;
1503     }
1504 #endif
1505     FL_SET(str, STR_NOEMBED);
1506     RSTRING(str)->as.heap.aux.capa = capa;
1507     RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1508     RSTRING(str)->as.heap.ptr[0] = '\0';
1509
1510     return str;
1511 }
1512
1513 VALUE
1514 rb_str_buf_new_cstr(const char *ptr)
1515 {
1516     VALUE str;
1517     long len = strlen(ptr);
1518
1519     str = rb_str_buf_new(len);
1520     rb_str_buf_cat(str, ptr, len);
1521
1522     return str;
1523 }
1524
1525 VALUE
1526 rb_str_tmp_new(long len)
1527 {
1528     return str_new(0, 0, len);
1529 }
1530
1531 void
1532 rb_str_free(VALUE str)
1533 {
1534     if (FL_TEST(str, RSTRING_FSTR)) {
1535         st_data_t fstr = (st_data_t)str;
1536
1537         RB_VM_LOCK_ENTER();
1538         {
1539             st_delete(rb_vm_fstring_table(), &fstr, NULL);
1540             RB_DEBUG_COUNTER_INC(obj_str_fstr);
1541         }
1542         RB_VM_LOCK_LEAVE();
1543     }
1544
1545     if (STR_EMBED_P(str)) {
1546         RB_DEBUG_COUNTER_INC(obj_str_embed);
1547     }
1548     else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1549         (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1550         (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1551     }
1552     else {
1553         RB_DEBUG_COUNTER_INC(obj_str_ptr);
1554         ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1555     }
1556 }
1557
1558 RUBY_FUNC_EXPORTED size_t
1559 rb_str_memsize(VALUE str)
1560 {
1561     if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1562         return STR_HEAP_SIZE(str);
1563     }
1564     else {
1565         return 0;
1566     }
1567 }
1568
1569 VALUE
1570 rb_str_to_str(VALUE str)
1571 {
1572     return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1573 }
1574
1575 static inline void str_discard(VALUE str);
1576 static void str_shared_replace(VALUE str, VALUE str2);
1577
1578 void
1579 rb_str_shared_replace(VALUE str, VALUE str2)
1580 {
1581     if (str != str2) str_shared_replace(str, str2);
1582 }
1583
1584 static void
1585 str_shared_replace(VALUE str, VALUE str2)
1586 {
1587     rb_encoding *enc;
1588     int cr;
1589     int termlen;
1590
1591     RUBY_ASSERT(str2 != str);
1592     enc = STR_ENC_GET(str2);
1593     cr = ENC_CODERANGE(str2);
1594     str_discard(str);
1595     termlen = rb_enc_mbminlen(enc);
1596
1597     if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1598         STR_SET_EMBED(str);
1599         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1600         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1601         rb_enc_associate(str, enc);
1602         ENC_CODERANGE_SET(str, cr);
1603     }
1604     else {
1605 #if USE_RVARGC
1606         if (STR_EMBED_P(str2)) {
1607             assert(!FL_TEST(str2, STR_SHARED));
1608             long len = RSTRING(str2)->as.embed.len;
1609             assert(len + termlen <= str_embed_capa(str2));
1610
1611             char *new_ptr = ALLOC_N(char, len + termlen);
1612             memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1613             RSTRING(str2)->as.heap.ptr = new_ptr;
1614             RSTRING(str2)->as.heap.len = len;
1615             RSTRING(str2)->as.heap.aux.capa = len;
1616             STR_SET_NOEMBED(str2);
1617         }
1618 #endif
1619
1620         STR_SET_NOEMBED(str);
1621         FL_UNSET(str, STR_SHARED);
1622         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1623         RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1624
1625         if (FL_TEST(str2, STR_SHARED)) {
1626             VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1627             STR_SET_SHARED(str, shared);
1628         }
1629         else {
1630             RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1631         }
1632
1633         /* abandon str2 */
1634         STR_SET_EMBED(str2);
1635         RSTRING_PTR(str2)[0] = 0;
1636         STR_SET_EMBED_LEN(str2, 0);
1637         rb_enc_associate(str, enc);
1638         ENC_CODERANGE_SET(str, cr);
1639     }
1640 }
1641
1642 VALUE
1643 rb_obj_as_string(VALUE obj)
1644 {
1645     VALUE str;
1646
1647     if (RB_TYPE_P(obj, T_STRING)) {
1648         return obj;
1649     }
1650     str = rb_funcall(obj, idTo_s, 0);
1651     return rb_obj_as_string_result(str, obj);
1652 }
1653
1654 MJIT_FUNC_EXPORTED VALUE
1655 rb_obj_as_string_result(VALUE str, VALUE obj)
1656 {
1657     if (!RB_TYPE_P(str, T_STRING))
1658         return rb_any_to_s(obj);
1659     return str;
1660 }
1661
1662 static VALUE
1663 str_replace(VALUE str, VALUE str2)
1664 {
1665     long len;
1666
1667     len = RSTRING_LEN(str2);
1668     if (STR_SHARED_P(str2)) {
1669         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1670         assert(OBJ_FROZEN(shared));
1671         STR_SET_NOEMBED(str);
1672         RSTRING(str)->as.heap.len = len;
1673         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1674         STR_SET_SHARED(str, shared);
1675         rb_enc_cr_str_exact_copy(str, str2);
1676     }
1677     else {
1678         str_replace_shared(str, str2);
1679     }
1680
1681     return str;
1682 }
1683
1684 static inline VALUE
1685 ec_str_alloc(struct rb_execution_context_struct *ec, VALUE klass, size_t size)
1686 {
1687     assert(size > 0);
1688     RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1689                            T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
1690     return (VALUE)str;
1691 }
1692
1693 static inline VALUE
1694 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1695 {
1696     size_t size = str_embed_size(capa);
1697     assert(rb_gc_size_allocatable_p(size));
1698 #if !USE_RVARGC
1699     assert(size <= sizeof(struct RString));
1700 #endif
1701     return ec_str_alloc(ec, klass, size);
1702 }
1703
1704 static inline VALUE
1705 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1706 {
1707     return ec_str_alloc(ec, klass, sizeof(struct RString));
1708 }
1709
1710 static inline VALUE
1711 str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1712 {
1713     const VALUE flag_mask =
1714 #if !USE_RVARGC
1715         RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1716 #endif
1717         ENC_CODERANGE_MASK | ENCODING_MASK |
1718         FL_FREEZE
1719         ;
1720     VALUE flags = FL_TEST_RAW(str, flag_mask);
1721     int encidx = 0;
1722     if (STR_EMBED_P(str)) {
1723         long len = RSTRING_EMBED_LEN(str);
1724
1725         assert(str_embed_capa(dup) >= len + 1);
1726         STR_SET_EMBED_LEN(dup, len);
1727         MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1728     }
1729     else {
1730         VALUE root = str;
1731         if (FL_TEST_RAW(str, STR_SHARED)) {
1732             root = RSTRING(str)->as.heap.aux.shared;
1733         }
1734         else if (UNLIKELY(!(flags & FL_FREEZE))) {
1735             root = str = str_new_frozen(klass, str);
1736             flags = FL_TEST_RAW(str, flag_mask);
1737         }
1738         assert(!STR_SHARED_P(root));
1739         assert(RB_OBJ_FROZEN_RAW(root));
1740 #if USE_RVARGC
1741         if (1) {
1742 #else
1743         if (STR_EMBED_P(root)) {
1744             MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
1745                    char, RSTRING_EMBED_LEN_MAX + 1);
1746         }
1747         else {
1748 #endif
1749             RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
1750             RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1751             RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1752             flags |= RSTRING_NOEMBED | STR_SHARED;
1753         }
1754     }
1755
1756     if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1757         encidx = rb_enc_get_index(str);
1758         flags &= ~ENCODING_MASK;
1759     }
1760     FL_SET_RAW(dup, flags & ~FL_FREEZE);
1761     if (encidx) rb_enc_associate_index(dup, encidx);
1762     return dup;
1763 }
1764
1765 static inline VALUE
1766 ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1767 {
1768     VALUE dup;
1769     if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1770         dup = ec_str_alloc_heap(ec, klass);
1771     }
1772     else {
1773         dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1774     }
1775
1776     return str_duplicate_setup(klass, str, dup);
1777 }
1778
1779 static inline VALUE
1780 str_duplicate(VALUE klass, VALUE str)
1781 {
1782     VALUE dup;
1783     if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1784         dup = str_alloc_heap(klass);
1785     }
1786     else {
1787        dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1788     }
1789
1790     return str_duplicate_setup(klass, str, dup);
1791 }
1792
1793 VALUE
1794 rb_str_dup(VALUE str)
1795 {
1796     return str_duplicate(rb_obj_class(str), str);
1797 }
1798
1799 VALUE
1800 rb_str_resurrect(VALUE str)
1801 {
1802     RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1803     return str_duplicate(rb_cString, str);
1804 }
1805
1806 VALUE
1807 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1808 {
1809     RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1810     return ec_str_duplicate(ec, rb_cString, str);
1811 }
1812
1813 /*
1814  *  call-seq:
1815  *    String.new(string = '') -> new_string
1816  *    String.new(string = '', encoding: encoding) -> new_string
1817  *    String.new(string = '', capacity: size) -> new_string
1818  *
1819  *  Returns a new \String that is a copy of +string+.
1820  *
1821  *  With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
1822  *    s = String.new
1823  *    s # => ""
1824  *    s.encoding # => #<Encoding:ASCII-8BIT>
1825  *
1826  *  With the single \String argument +string+, returns a copy of +string+
1827  *  with the same encoding as +string+:
1828  *    s = String.new("Que veut dire \u{e7}a?")
1829  *    s # => "Que veut dire \u{e7}a?"
1830  *    s.encoding # => #<Encoding:UTF-8>
1831  *
1832  *  Literal strings like <tt>""</tt> or here-documents always use
1833  *  {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
1834  *
1835  *  With keyword +encoding+, returns a copy of +str+
1836  *  with the specified encoding:
1837  *    s = String.new(encoding: 'ASCII')
1838  *    s.encoding # => #<Encoding:US-ASCII>
1839  *    s = String.new('foo', encoding: 'ASCII')
1840  *    s.encoding # => #<Encoding:US-ASCII>
1841  *
1842  *  Note that these are equivalent:
1843  *    s0 = String.new('foo', encoding: 'ASCII')
1844  *    s1 = 'foo'.force_encoding('ASCII')
1845  *    s0.encoding == s1.encoding # => true
1846  *
1847  *  With keyword +capacity+, returns a copy of +str+;
1848  *  the given +capacity+ may set the size of the internal buffer,
1849  *  which may affect performance:
1850  *    String.new(capacity: 1) # => ""
1851  *    String.new(capacity: 4096) # => ""
1852  *
1853  *  The +string+, +encoding+, and +capacity+ arguments may all be used together:
1854  *
1855  *    String.new('hello', encoding: 'UTF-8', capacity: 25)
1856  *
1857  */
1858
1859 static VALUE
1860 rb_str_init(int argc, VALUE *argv, VALUE str)
1861 {
1862     static ID keyword_ids[2];
1863     VALUE orig, opt, venc, vcapa;
1864     VALUE kwargs[2];
1865     rb_encoding *enc = 0;
1866     int n;
1867
1868     if (!keyword_ids[0]) {
1869         keyword_ids[0] = rb_id_encoding();
1870         CONST_ID(keyword_ids[1], "capacity");
1871     }
1872
1873     n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1874     if (!NIL_P(opt)) {
1875         rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1876         venc = kwargs[0];
1877         vcapa = kwargs[1];
1878         if (venc != Qundef && !NIL_P(venc)) {
1879             enc = rb_to_encoding(venc);
1880         }
1881         if (vcapa != Qundef && !NIL_P(vcapa)) {
1882             long capa = NUM2LONG(vcapa);
1883             long len = 0;
1884             int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1885
1886             if (capa < STR_BUF_MIN_SIZE) {
1887                 capa = STR_BUF_MIN_SIZE;
1888             }
1889             if (n == 1) {
1890                 StringValue(orig);
1891                 len = RSTRING_LEN(orig);
1892                 if (capa < len) {
1893                     capa = len;
1894                 }
1895                 if (orig == str) n = 0;
1896             }
1897             str_modifiable(str);
1898             if (STR_EMBED_P(str)) { /* make noembed always */
1899                 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1900 #if USE_RVARGC
1901                 assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
1902                 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
1903 #else
1904                 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
1905 #endif
1906                 RSTRING(str)->as.heap.ptr = new_ptr;
1907             }
1908             else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1909                 const size_t size = (size_t)capa + termlen;
1910                 const char *const old_ptr = RSTRING_PTR(str);
1911                 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1912                 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1913                 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1914                 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1915                 RSTRING(str)->as.heap.ptr = new_ptr;
1916             }
1917             else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1918                 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1919                         (size_t)capa + termlen, STR_HEAP_SIZE(str));
1920             }
1921             RSTRING(str)->as.heap.len = len;
1922             TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1923             if (n == 1) {
1924                 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1925                 rb_enc_cr_str_exact_copy(str, orig);
1926             }
1927             FL_SET(str, STR_NOEMBED);
1928             RSTRING(str)->as.heap.aux.capa = capa;
1929         }
1930         else if (n == 1) {
1931             rb_str_replace(str, orig);
1932         }
1933         if (enc) {
1934             rb_enc_associate(str, enc);
1935             ENC_CODERANGE_CLEAR(str);
1936         }
1937     }
1938     else if (n == 1) {
1939         rb_str_replace(str, orig);
1940     }
1941     return str;
1942 }
1943
1944 #ifdef NONASCII_MASK
1945 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1946
1947 /*
1948  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1949  * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
1950  * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1951  *
1952  * if (!(byte & 0x80))
1953  *   byte |= 0x40;          // turn on bit6
1954  * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
1955  *
1956  * This function calculates whether a byte is leading or not for all bytes
1957  * in the argument word by concurrently using the above logic, and then
1958  * adds up the number of leading bytes in the word.
1959  */
1960 static inline uintptr_t
1961 count_utf8_lead_bytes_with_word(const uintptr_t *s)
1962 {
1963     uintptr_t d = *s;
1964
1965     /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1966     d = (d>>6) | (~d>>7);
1967     d &= NONASCII_MASK >> 7;
1968
1969     /* Gather all bytes. */
1970 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1971     /* use only if it can use POPCNT */
1972     return rb_popcount_intptr(d);
1973 #else
1974     d += (d>>8);
1975     d += (d>>16);
1976 # if SIZEOF_VOIDP == 8
1977     d += (d>>32);
1978 # endif
1979     return (d&0xF);
1980 #endif
1981 }
1982 #endif
1983
1984 static inline long
1985 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1986 {
1987     long c;
1988     const char *q;
1989
1990     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1991         long diff = (long)(e - p);
1992         return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1993     }
1994 #ifdef NONASCII_MASK
1995     else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1996         uintptr_t len = 0;
1997         if ((int)sizeof(uintptr_t) * 2 < e - p) {
1998             const uintptr_t *s, *t;
1999             const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2000             s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2001             t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2002             while (p < (const char *)s) {
2003                 if (is_utf8_lead_byte(*p)) len++;
2004                 p++;
2005             }
2006             while (s < t) {
2007                 len += count_utf8_lead_bytes_with_word(s);
2008                 s++;
2009             }
2010             p = (const char *)s;
2011         }
2012         while (p < e) {
2013             if (is_utf8_lead_byte(*p)) len++;
2014             p++;
2015         }
2016         return (long)len;
2017     }
2018 #endif
2019     else if (rb_enc_asciicompat(enc)) {
2020         c = 0;
2021         if (ENC_CODERANGE_CLEAN_P(cr)) {
2022             while (p < e) {
2023                 if (ISASCII(*p)) {
2024                     q = search_nonascii(p, e);
2025                     if (!q)
2026                         return c + (e - p);
2027                     c += q - p;
2028                     p = q;
2029                 }
2030                 p += rb_enc_fast_mbclen(p, e, enc);
2031                 c++;
2032             }
2033         }
2034         else {
2035             while (p < e) {
2036                 if (ISASCII(*p)) {
2037                     q = search_nonascii(p, e);
2038                     if (!q)
2039                         return c + (e - p);
2040                     c += q - p;
2041                     p = q;
2042                 }
2043                 p += rb_enc_mbclen(p, e, enc);
2044                 c++;
2045             }
2046         }
2047         return c;
2048     }
2049
2050     for (c=0; p<e; c++) {
2051         p += rb_enc_mbclen(p, e, enc);
2052     }
2053     return c;
2054 }
2055
2056 long
2057 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2058 {
2059     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2060 }
2061
2062 /* To get strlen with cr
2063  * Note that given cr is not used.
2064  */
2065 long
2066 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2067 {
2068     long c;
2069     const char *q;
2070     int ret;
2071
2072     *cr = 0;
2073     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2074         long diff = (long)(e - p);
2075         return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2076     }
2077     else if (rb_enc_asciicompat(enc)) {
2078         c = 0;
2079         while (p < e) {
2080             if (ISASCII(*p)) {
2081                 q = search_nonascii(p, e);
2082                 if (!q) {
2083                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
2084                     return c + (e - p);
2085                 }
2086                 c += q - p;
2087                 p = q;
2088             }
2089             ret = rb_enc_precise_mbclen(p, e, enc);
2090             if (MBCLEN_CHARFOUND_P(ret)) {
2091                 *cr |= ENC_CODERANGE_VALID;
2092                 p += MBCLEN_CHARFOUND_LEN(ret);
2093             }
2094             else {
2095                 *cr = ENC_CODERANGE_BROKEN;
2096                 p++;
2097             }
2098             c++;
2099         }
2100         if (!*cr) *cr = ENC_CODERANGE_7BIT;
2101         return c;
2102     }
2103
2104     for (c=0; p<e; c++) {
2105         ret = rb_enc_precise_mbclen(p, e, enc);
2106         if (MBCLEN_CHARFOUND_P(ret)) {
2107             *cr |= ENC_CODERANGE_VALID;
2108             p += MBCLEN_CHARFOUND_LEN(ret);
2109         }
2110         else {
2111             *cr = ENC_CODERANGE_BROKEN;
2112             if (p + rb_enc_mbminlen(enc) <= e)
2113                 p += rb_enc_mbminlen(enc);
2114             else
2115                 p = e;
2116         }
2117     }
2118     if (!*cr) *cr = ENC_CODERANGE_7BIT;
2119     return c;
2120 }
2121
2122 /* enc must be str's enc or rb_enc_check(str, str2) */
2123 static long
2124 str_strlen(VALUE str, rb_encoding *enc)
2125 {
2126     const char *p, *e;
2127     int cr;
2128
2129     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2130     if (!enc) enc = STR_ENC_GET(str);
2131     p = RSTRING_PTR(str);
2132     e = RSTRING_END(str);
2133     cr = ENC_CODERANGE(str);
2134
2135     if (cr == ENC_CODERANGE_UNKNOWN) {
2136         long n = rb_enc_strlen_cr(p, e, enc, &cr);
2137         if (cr) ENC_CODERANGE_SET(str, cr);
2138         return n;
2139     }
2140     else {
2141         return enc_strlen(p, e, enc, cr);
2142     }
2143 }
2144
2145 long
2146 rb_str_strlen(VALUE str)
2147 {
2148     return str_strlen(str, NULL);
2149 }
2150
2151 /*
2152  *  call-seq:
2153  *    length -> integer
2154  *
2155  *  Returns the count of characters (not bytes) in +self+:
2156  *
2157  *    "\x80\u3042".length # => 2
2158  *    "hello".length # => 5
2159  *
2160  *  String#size is an alias for String#length.
2161  *
2162  *  Related: String#bytesize.
2163  */
2164
2165 VALUE
2166 rb_str_length(VALUE str)
2167 {
2168     return LONG2NUM(str_strlen(str, NULL));
2169 }
2170
2171 /*
2172  *  call-seq:
2173  *    bytesize -> integer
2174  *
 *  Returns the count of bytes in +self+:
2176  *
2177  *    "\x80\u3042".bytesize # => 4
2178  *    "hello".bytesize # => 5
2179  *
2180  *  Related: String#length.
2181  */
2182
2183 static VALUE
2184 rb_str_bytesize(VALUE str)
2185 {
2186     return LONG2NUM(RSTRING_LEN(str));
2187 }
2188
2189 /*
2190  *  call-seq:
2191  *    empty? -> true or false
2192  *
2193  *  Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2194  *
2195  *    "hello".empty? # => false
2196  *    " ".empty? # => false
2197  *    "".empty? # => true
2198  *
2199  */
2200
2201 static VALUE
2202 rb_str_empty(VALUE str)
2203 {
2204     return RBOOL(RSTRING_LEN(str) == 0);
2205 }
2206
2207 /*
2208  *  call-seq:
2209  *    string + other_string -> new_string
2210  *
2211  *  Returns a new \String containing +other_string+ concatenated to +self+:
2212  *
2213  *    "Hello from " + self.to_s # => "Hello from main"
2214  *
2215  */
2216
2217 VALUE
2218 rb_str_plus(VALUE str1, VALUE str2)
2219 {
2220     VALUE str3;
2221     rb_encoding *enc;
2222     char *ptr1, *ptr2, *ptr3;
2223     long len1, len2;
2224     int termlen;
2225
2226     StringValue(str2);
2227     enc = rb_enc_check_str(str1, str2);
2228     RSTRING_GETMEM(str1, ptr1, len1);
2229     RSTRING_GETMEM(str2, ptr2, len2);
2230     termlen = rb_enc_mbminlen(enc);
2231     if (len1 > LONG_MAX - len2) {
2232         rb_raise(rb_eArgError, "string size too big");
2233     }
2234     str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2235     ptr3 = RSTRING_PTR(str3);
2236     memcpy(ptr3, ptr1, len1);
2237     memcpy(ptr3+len1, ptr2, len2);
2238     TERM_FILL(&ptr3[len1+len2], termlen);
2239
2240     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2241                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
2242     RB_GC_GUARD(str1);
2243     RB_GC_GUARD(str2);
2244     return str3;
2245 }
2246
2247 /* A variant of rb_str_plus that does not raise but return Qundef instead. */
2248 MJIT_FUNC_EXPORTED VALUE
2249 rb_str_opt_plus(VALUE str1, VALUE str2)
2250 {
2251     assert(RBASIC_CLASS(str1) == rb_cString);
2252     assert(RBASIC_CLASS(str2) == rb_cString);
2253     long len1, len2;
2254     MAYBE_UNUSED(char) *ptr1, *ptr2;
2255     RSTRING_GETMEM(str1, ptr1, len1);
2256     RSTRING_GETMEM(str2, ptr2, len2);
2257     int enc1 = rb_enc_get_index(str1);
2258     int enc2 = rb_enc_get_index(str2);
2259
2260     if (enc1 < 0) {
2261         return Qundef;
2262     }
2263     else if (enc2 < 0) {
2264         return Qundef;
2265     }
2266     else if (enc1 != enc2) {
2267         return Qundef;
2268     }
2269     else if (len1 > LONG_MAX - len2) {
2270         return Qundef;
2271     }
2272     else {
2273         return rb_str_plus(str1, str2);
2274     }
2275
2276 }
2277
2278 /*
2279  *  call-seq:
2280  *    string * integer -> new_string
2281  *
2282  *  Returns a new \String containing +integer+ copies of +self+:
2283  *
2284  *    "Ho! " * 3 # => "Ho! Ho! Ho! "
2285  *    "Ho! " * 0 # => ""
2286  *
2287  */
2288
2289 VALUE
2290 rb_str_times(VALUE str, VALUE times)
2291 {
2292     VALUE str2;
2293     long n, len;
2294     char *ptr2;
2295     int termlen;
2296
2297     if (times == INT2FIX(1)) {
2298         return str_duplicate(rb_cString, str);
2299     }
2300     if (times == INT2FIX(0)) {
2301         str2 = str_alloc_embed(rb_cString, 0);
2302         rb_enc_copy(str2, str);
2303         return str2;
2304     }
2305     len = NUM2LONG(times);
2306     if (len < 0) {
2307         rb_raise(rb_eArgError, "negative argument");
2308     }
2309     if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2310         if (STR_EMBEDDABLE_P(len, 1)) {
2311             str2 = str_alloc_embed(rb_cString, len + 1);
2312             memset(RSTRING_PTR(str2), 0, len + 1);
2313         }
2314         else {
2315             str2 = str_alloc_heap(rb_cString);
2316             RSTRING(str2)->as.heap.aux.capa = len;
2317             RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2318             STR_SET_NOEMBED(str2);
2319         }
2320         STR_SET_LEN(str2, len);
2321         rb_enc_copy(str2, str);
2322         return str2;
2323     }
2324     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
2325         rb_raise(rb_eArgError, "argument too big");
2326     }
2327
2328     len *= RSTRING_LEN(str);
2329     termlen = TERM_LEN(str);
2330     str2 = str_new0(rb_cString, 0, len, termlen);
2331     ptr2 = RSTRING_PTR(str2);
2332     if (len) {
2333         n = RSTRING_LEN(str);
2334         memcpy(ptr2, RSTRING_PTR(str), n);
2335         while (n <= len/2) {
2336             memcpy(ptr2 + n, ptr2, n);
2337             n *= 2;
2338         }
2339         memcpy(ptr2 + n, ptr2, len-n);
2340     }
2341     STR_SET_LEN(str2, len);
2342     TERM_FILL(&ptr2[len], termlen);
2343     rb_enc_cr_str_copy_for_substr(str2, str);
2344
2345     return str2;
2346 }
2347
2348 /*
2349  *  call-seq:
2350  *    string % object -> new_string
2351  *
2352  *  Returns the result of formatting +object+ into the format specification +self+
2353  *  (see Kernel#sprintf for formatting details):
2354  *
2355  *    "%05d" % 123 # => "00123"
2356  *
2357  *  If +self+ contains multiple substitutions, +object+ must be
2358  *  an \Array or \Hash containing the values to be substituted:
2359  *
2360  *    "%-5s: %016x" % [ "ID", self.object_id ] # => "ID   : 00002b054ec93168"
2361  *    "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2362  *    "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2363  *
2364  */
2365
2366 static VALUE
2367 rb_str_format_m(VALUE str, VALUE arg)
2368 {
2369     VALUE tmp = rb_check_array_type(arg);
2370
2371     if (!NIL_P(tmp)) {
2372         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2373     }
2374     return rb_str_format(1, &arg, str);
2375 }
2376
2377 static inline void
2378 rb_check_lockedtmp(VALUE str)
2379 {
2380     if (FL_TEST(str, STR_TMPLOCK)) {
2381         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2382     }
2383 }
2384
2385 static inline void
2386 str_modifiable(VALUE str)
2387 {
2388     rb_check_lockedtmp(str);
2389     rb_check_frozen(str);
2390 }
2391
2392 static inline int
2393 str_dependent_p(VALUE str)
2394 {
2395     if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2396         return 0;
2397     }
2398     else {
2399         return 1;
2400     }
2401 }
2402
2403 static inline int
2404 str_independent(VALUE str)
2405 {
2406     str_modifiable(str);
2407     return !str_dependent_p(str);
2408 }
2409
2410 static void
2411 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2412 {
2413     char *ptr;
2414     char *oldptr;
2415     long capa = len + expand;
2416
2417     if (len > capa) len = capa;
2418
2419     if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2420         ptr = RSTRING(str)->as.heap.ptr;
2421         STR_SET_EMBED(str);
2422         memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2423         TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2424         STR_SET_EMBED_LEN(str, len);
2425         return;
2426     }
2427
2428     ptr = ALLOC_N(char, (size_t)capa + termlen);
2429     oldptr = RSTRING_PTR(str);
2430     if (oldptr) {
2431         memcpy(ptr, oldptr, len);
2432     }
2433     if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2434         xfree(oldptr);
2435     }
2436     STR_SET_NOEMBED(str);
2437     FL_UNSET(str, STR_SHARED|STR_NOFREE);
2438     TERM_FILL(ptr + len, termlen);
2439     RSTRING(str)->as.heap.ptr = ptr;
2440     RSTRING(str)->as.heap.len = len;
2441     RSTRING(str)->as.heap.aux.capa = capa;
2442 }
2443
2444 void
2445 rb_str_modify(VALUE str)
2446 {
2447     if (!str_independent(str))
2448         str_make_independent(str);
2449     ENC_CODERANGE_CLEAR(str);
2450 }
2451
2452 void
2453 rb_str_modify_expand(VALUE str, long expand)
2454 {
2455     int termlen = TERM_LEN(str);
2456     long len = RSTRING_LEN(str);
2457
2458     if (expand < 0) {
2459         rb_raise(rb_eArgError, "negative expanding string size");
2460     }
2461     if (expand >= LONG_MAX - len) {
2462         rb_raise(rb_eArgError, "string size too big");
2463     }
2464
2465     if (!str_independent(str)) {
2466         str_make_independent_expand(str, len, expand, termlen);
2467     }
2468     else if (expand > 0) {
2469         RESIZE_CAPA_TERM(str, len + expand, termlen);
2470     }
2471     ENC_CODERANGE_CLEAR(str);
2472 }
2473
2474 /* As rb_str_modify(), but don't clear coderange */
2475 static void
2476 str_modify_keep_cr(VALUE str)
2477 {
2478     if (!str_independent(str))
2479         str_make_independent(str);
2480     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2481         /* Force re-scan later */
2482         ENC_CODERANGE_CLEAR(str);
2483 }
2484
2485 static inline void
2486 str_discard(VALUE str)
2487 {
2488     str_modifiable(str);
2489     if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2490         ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2491         RSTRING(str)->as.heap.ptr = 0;
2492         RSTRING(str)->as.heap.len = 0;
2493     }
2494 }
2495
2496 void
2497 rb_must_asciicompat(VALUE str)
2498 {
2499     rb_encoding *enc = rb_enc_get(str);
2500     if (!rb_enc_asciicompat(enc)) {
2501         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2502     }
2503 }
2504
2505 VALUE
2506 rb_string_value(volatile VALUE *ptr)
2507 {
2508     VALUE s = *ptr;
2509     if (!RB_TYPE_P(s, T_STRING)) {
2510         s = rb_str_to_str(s);
2511         *ptr = s;
2512     }
2513     return s;
2514 }
2515
2516 char *
2517 rb_string_value_ptr(volatile VALUE *ptr)
2518 {
2519     VALUE str = rb_string_value(ptr);
2520     return RSTRING_PTR(str);
2521 }
2522
/*
 * Return 1 when the first n bytes at s are all zero, 0 otherwise.
 * Nothing is read, and 1 is returned, when n <= 0.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != 0) return 0;
    }
    return 1;
}
2531
2532 static const char *
2533 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2534 {
2535     const char *e = s + len;
2536
2537     for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2538         if (zero_filled(s, minlen)) return s;
2539     }
2540     return 0;
2541 }
2542
2543 static char *
2544 str_fill_term(VALUE str, char *s, long len, int termlen)
2545 {
2546     /* This function assumes that (capa + termlen) bytes of memory
2547      * is allocated, like many other functions in this file.
2548      */
2549     if (str_dependent_p(str)) {
2550         if (!zero_filled(s + len, termlen))
2551             str_make_independent_expand(str, len, 0L, termlen);
2552     }
2553     else {
2554         TERM_FILL(s + len, termlen);
2555         return s;
2556     }
2557     return RSTRING_PTR(str);
2558 }
2559
2560 void
2561 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2562 {
2563     long capa = str_capacity(str, oldtermlen) + oldtermlen;
2564     long len = RSTRING_LEN(str);
2565
2566     assert(capa >= len);
2567     if (capa - len < termlen) {
2568         rb_check_lockedtmp(str);
2569         str_make_independent_expand(str, len, 0L, termlen);
2570     }
2571     else if (str_dependent_p(str)) {
2572         if (termlen > oldtermlen)
2573             str_make_independent_expand(str, len, 0L, termlen);
2574     }
2575     else {
2576         if (!STR_EMBED_P(str)) {
2577             /* modify capa instead of realloc */
2578             assert(!FL_TEST((str), STR_SHARED));
2579             RSTRING(str)->as.heap.aux.capa = capa - termlen;
2580         }
2581         if (termlen > oldtermlen) {
2582             TERM_FILL(RSTRING_PTR(str) + len, termlen);
2583         }
2584     }
2585
2586     return;
2587 }
2588
2589 static char *
2590 str_null_check(VALUE str, int *w)
2591 {
2592     char *s = RSTRING_PTR(str);
2593     long len = RSTRING_LEN(str);
2594     rb_encoding *enc = rb_enc_get(str);
2595     const int minlen = rb_enc_mbminlen(enc);
2596
2597     if (minlen > 1) {
2598         *w = 1;
2599         if (str_null_char(s, len, minlen, enc)) {
2600             return NULL;
2601         }
2602         return str_fill_term(str, s, len, minlen);
2603     }
2604     *w = 0;
2605     if (!s || memchr(s, 0, len)) {
2606         return NULL;
2607     }
2608     if (s[len]) {
2609         s = str_fill_term(str, s, len, minlen);
2610     }
2611     return s;
2612 }
2613
2614 char *
2615 rb_str_to_cstr(VALUE str)
2616 {
2617     int w;
2618     return str_null_check(str, &w);
2619 }
2620
2621 char *
2622 rb_string_value_cstr(volatile VALUE *ptr)
2623 {
2624     VALUE str = rb_string_value(ptr);
2625     int w;
2626     char *s = str_null_check(str, &w);
2627     if (!s) {
2628         if (w) {
2629             rb_raise(rb_eArgError, "string contains null char");
2630         }
2631         rb_raise(rb_eArgError, "string contains null byte");
2632     }
2633     return s;
2634 }
2635
2636 char *
2637 rb_str_fill_terminator(VALUE str, const int newminlen)
2638 {
2639     char *s = RSTRING_PTR(str);
2640     long len = RSTRING_LEN(str);
2641     return str_fill_term(str, s, len, newminlen);
2642 }
2643
2644 VALUE
2645 rb_check_string_type(VALUE str)
2646 {
2647     str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2648     return str;
2649 }
2650
2651 /*
2652  *  call-seq:
2653  *    String.try_convert(object) -> object, new_string, or nil
2654  *
2655  *  If +object+ is a \String object, returns +object+.
2656  *
2657  *  Otherwise if +object+ responds to <tt>:to_str</tt>,
2658  *  calls <tt>object.to_str</tt> and returns the result.
2659  *
2660  *  Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2661  *
2662  *  Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2663  */
2664 static VALUE
2665 rb_str_s_try_convert(VALUE dummy, VALUE str)
2666 {
2667     return rb_check_string_type(str);
2668 }
2669
2670 static char*
2671 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2672 {
2673     long nth = *nthp;
2674     if (rb_enc_mbmaxlen(enc) == 1) {
2675         p += nth;
2676     }
2677     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2678         p += nth * rb_enc_mbmaxlen(enc);
2679     }
2680     else if (rb_enc_asciicompat(enc)) {
2681         const char *p2, *e2;
2682         int n;
2683
2684         while (p < e && 0 < nth) {
2685             e2 = p + nth;
2686             if (e < e2) {
2687                 *nthp = nth;
2688                 return (char *)e;
2689             }
2690             if (ISASCII(*p)) {
2691                 p2 = search_nonascii(p, e2);
2692                 if (!p2) {
2693                     nth -= e2 - p;
2694                     *nthp = nth;
2695                     return (char *)e2;
2696                 }
2697                 nth -= p2 - p;
2698                 p = p2;
2699             }
2700             n = rb_enc_mbclen(p, e, enc);
2701             p += n;
2702             nth--;
2703         }
2704         *nthp = nth;
2705         if (nth != 0) {
2706             return (char *)e;
2707         }
2708         return (char *)p;
2709     }
2710     else {
2711         while (p < e && nth--) {
2712             p += rb_enc_mbclen(p, e, enc);
2713         }
2714     }
2715     if (p > e) p = e;
2716     *nthp = nth;
2717     return (char*)p;
2718 }
2719
2720 char*
2721 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2722 {
2723     return str_nth_len(p, e, &nth, enc);
2724 }
2725
2726 static char*
2727 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2728 {
2729     if (singlebyte)
2730         p += nth;
2731     else {
2732         p = str_nth_len(p, e, &nth, enc);
2733     }
2734     if (!p) return 0;
2735     if (p > e) p = e;
2736     return (char *)p;
2737 }
2738
2739 /* char offset to byte offset */
2740 static long
2741 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2742 {
2743     const char *pp = str_nth(p, e, nth, enc, singlebyte);
2744     if (!pp) return e - p;
2745     return pp - p;
2746 }
2747
2748 long
2749 rb_str_offset(VALUE str, long pos)
2750 {
2751     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2752                       STR_ENC_GET(str), single_byte_optimizable(str));
2753 }
2754
2755 #ifdef NONASCII_MASK
2756 static char *
2757 str_utf8_nth(const char *p, const char *e, long *nthp)
2758 {
2759     long nth = *nthp;
2760     if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2761         const uintptr_t *s, *t;
2762         const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2763         s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2764         t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2765         while (p < (const char *)s) {
2766             if (is_utf8_lead_byte(*p)) nth--;
2767             p++;
2768         }
2769         do {
2770             nth -= count_utf8_lead_bytes_with_word(s);
2771             s++;
2772         } while (s < t && (int)SIZEOF_VOIDP <= nth);
2773         p = (char *)s;
2774     }
2775     while (p < e) {
2776         if (is_utf8_lead_byte(*p)) {
2777             if (nth == 0) break;
2778             nth--;
2779         }
2780         p++;
2781     }
2782     *nthp = nth;
2783     return (char *)p;
2784 }
2785
/* Byte distance from p to the position str_utf8_nth() reaches for nth within [p, e). */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *const pos = str_utf8_nth(p, e, &nth);

    return pos - p;
}
2792 #endif
2793
2794 /* byte offset to char offset */
2795 long
2796 rb_str_sublen(VALUE str, long pos)
2797 {
2798     if (single_byte_optimizable(str) || pos < 0)
2799         return pos;
2800     else {
2801         char *p = RSTRING_PTR(str);
2802         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2803     }
2804 }
2805
2806 VALUE
2807 rb_str_subseq(VALUE str, long beg, long len)
2808 {
2809     VALUE str2;
2810
2811     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2812         SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2813         long olen;
2814         str2 = rb_str_new_shared(rb_str_new_frozen_String(str));
2815         RSTRING(str2)->as.heap.ptr += beg;
2816         olen = RSTRING(str2)->as.heap.len;
2817         if (olen > len) RSTRING(str2)->as.heap.len = len;
2818     }
2819     else {
2820         str2 = rb_str_new(RSTRING_PTR(str)+beg, len);
2821         RB_GC_GUARD(str);
2822     }
2823
2824     rb_enc_cr_str_copy_for_substr(str2, str);
2825
2826     return str2;
2827 }
2828
2829 char *
2830 rb_str_subpos(VALUE str, long beg, long *lenp)
2831 {
2832     long len = *lenp;
2833     long slen = -1L;
2834     long blen = RSTRING_LEN(str);
2835     rb_encoding *enc = STR_ENC_GET(str);
2836     char *p, *s = RSTRING_PTR(str), *e = s + blen;
2837
2838     if (len < 0) return 0;
2839     if (!blen) {
2840         len = 0;
2841     }
2842     if (single_byte_optimizable(str)) {
2843         if (beg > blen) return 0;
2844         if (beg < 0) {
2845             beg += blen;
2846             if (beg < 0) return 0;
2847         }
2848         if (len > blen - beg)
2849             len = blen - beg;
2850         if (len < 0) return 0;
2851         p = s + beg;
2852         goto end;
2853     }
2854     if (beg < 0) {
2855         if (len > -beg) len = -beg;
2856         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2857             beg = -beg;
2858             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2859             p = e;
2860             if (!p) return 0;
2861             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2862             if (!p) return 0;
2863             len = e - p;
2864             goto end;
2865         }
2866         else {
2867             slen = str_strlen(str, enc);
2868             beg += slen;
2869             if (beg < 0) return 0;
2870             p = s + beg;
2871             if (len == 0) goto end;
2872         }
2873     }
2874     else if (beg > 0 && beg > RSTRING_LEN(str)) {
2875         return 0;
2876     }
2877     if (len == 0) {
2878         if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2879         p = s + beg;
2880     }
2881 #ifdef NONASCII_MASK
2882     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2883         enc == rb_utf8_encoding()) {
2884         p = str_utf8_nth(s, e, &beg);
2885         if (beg > 0) return 0;
2886         len = str_utf8_offset(p, e, len);
2887     }
2888 #endif
2889     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2890         int char_sz = rb_enc_mbmaxlen(enc);
2891
2892         p = s + beg * char_sz;
2893         if (p > e) {
2894             return 0;
2895         }
2896         else if (len * char_sz > e - p)
2897             len = e - p;
2898         else
2899             len *= char_sz;
2900     }
2901     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2902         if (beg > 0) return 0;
2903         len = 0;
2904     }
2905     else {
2906         len = str_offset(p, e, len, enc, 0);
2907     }
2908   end:
2909     *lenp = len;
2910     RB_GC_GUARD(str);
2911     return p;
2912 }
2913
2914 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2915
2916 VALUE
2917 rb_str_substr(VALUE str, long beg, long len)
2918 {
2919     return str_substr(str, beg, len, TRUE);
2920 }
2921
2922 static VALUE
2923 str_substr(VALUE str, long beg, long len, int empty)
2924 {
2925     VALUE str2;
2926     char *p = rb_str_subpos(str, beg, &len);
2927
2928     if (!p) return Qnil;
2929     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2930         SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2931         long ofs = p - RSTRING_PTR(str);
2932         str2 = rb_str_new_frozen(str);
2933         str2 = str_new_shared(rb_cString, str2);
2934         RSTRING(str2)->as.heap.ptr += ofs;
2935         RSTRING(str2)->as.heap.len = len;
2936         ENC_CODERANGE_CLEAR(str2);
2937     }
2938     else {
2939         if (!len && !empty) return Qnil;
2940         str2 = rb_str_new(p, len);
2941         RB_GC_GUARD(str);
2942     }
2943     rb_enc_cr_str_copy_for_substr(str2, str);
2944
2945     return str2;
2946 }
2947
2948 VALUE
2949 rb_str_freeze(VALUE str)
2950 {
2951     if (OBJ_FROZEN(str)) return str;
2952     rb_str_resize(str, RSTRING_LEN(str));
2953     return rb_obj_freeze(str);
2954 }
2955
2956
2957 /*
2958  * call-seq:
2959  *   +string -> new_string or self
2960  *
2961  * Returns +self+ if +self+ is not frozen.
2962  *
 * Otherwise, returns <tt>self.dup</tt>, which is not frozen.
2964  */
2965 static VALUE
2966 str_uplus(VALUE str)
2967 {
2968     if (OBJ_FROZEN(str)) {
2969         return rb_str_dup(str);
2970     }
2971     else {
2972         return str;
2973     }
2974 }
2975
2976 /*
2977  * call-seq:
2978  *   -string -> frozen_string
2979  *
2980  * Returns a frozen, possibly pre-existing copy of the string.
2981  *
2982  * The returned \String will be deduplicated as long as it does not have
2983  * any instance variables set on it.
2984  */
2985 static VALUE
2986 str_uminus(VALUE str)
2987 {
2988     if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2989         str = rb_str_dup(str);
2990     }
2991     return rb_fstring(str);
2992 }
2993
/*
 * rb_str_dup_frozen: presumably emitted here as an alias function that
 * forwards to rb_str_new_frozen (see RUBY_ALIAS_FUNCTION -- TODO confirm).
 * The #define makes every later use of the name in this file refer to
 * rb_str_new_frozen directly.
 */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
2996
2997 VALUE
2998 rb_str_locktmp(VALUE str)
2999 {
3000     if (FL_TEST(str, STR_TMPLOCK)) {
3001         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3002     }
3003     FL_SET(str, STR_TMPLOCK);
3004     return str;
3005 }
3006
3007 VALUE
3008 rb_str_unlocktmp(VALUE str)
3009 {
3010     if (!FL_TEST(str, STR_TMPLOCK)) {
3011         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3012     }
3013     FL_UNSET(str, STR_TMPLOCK);
3014     return str;
3015 }
3016
3017 RUBY_FUNC_EXPORTED VALUE
3018 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3019 {
3020     rb_str_locktmp(str);
3021     return rb_ensure(func, arg, rb_str_unlocktmp, str);
3022 }
3023
3024 void
3025 rb_str_set_len(VALUE str, long len)
3026 {
3027     long capa;
3028     const int termlen = TERM_LEN(str);
3029
3030     str_modifiable(str);
3031     if (STR_SHARED_P(str)) {
3032         rb_raise(rb_eRuntimeError, "can't set length of shared string");
3033     }
3034     if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3035         rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3036     }
3037     STR_SET_LEN(str, len);
3038     TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3039 }
3040
/*
 * Resizes +str+ to +len+ bytes and returns +str+.
 *
 * Raises ArgumentError when +len+ is negative. The cached code range is
 * always cleared. Depending on the current storage and on whether the new
 * length (plus terminator) fits in the embedded area, the string stays
 * embedded, moves from heap to embedded storage, is expanded through
 * str_make_independent_expand, or has its heap buffer reallocated.
 */
VALUE
rb_str_resize(VALUE str, long len)
{
    long slen;
    int independent;

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }

    /* NOTE(review): str_independent presumably also verifies that str is
     * modifiable; its result decides below whether the heap buffer may be
     * freed or reallocated in place. */
    independent = str_independent(str);
    ENC_CODERANGE_CLEAR(str);
    slen = RSTRING_LEN(str);

    {
        long capa;
        const int termlen = TERM_LEN(str);
        if (STR_EMBED_P(str)) {
            /* Embedded string: nothing to do for an unchanged length. */
            if (len == slen) return str;
            /* Still fits in the embedded area: just adjust length and terminator. */
            if (str_embed_capa(str) >= len + termlen) {
                STR_SET_EMBED_LEN(str, len);
                TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
                return str;
            }
            /* Too large for the embedded area; falls through to the heap
             * length/terminator update at the bottom. */
            str_make_independent_expand(str, slen, len - slen, termlen);
        }
        else if (str_embed_capa(str) >= len + termlen) {
            /* Heap string whose new size fits in the embedded area:
             * save the heap pointer, switch to embedded storage, and copy
             * at most min(slen, len) bytes over. */
            char *ptr = STR_HEAP_PTR(str);
            STR_SET_EMBED(str);
            if (slen > len) slen = len;
            if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
            TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
            STR_SET_EMBED_LEN(str, len);
            /* The old buffer is released only when this string was independent. */
            if (independent) ruby_xfree(ptr);
            return str;
        }
        else if (!independent) {
            /* Non-independent heap string: unchanged length needs no work. */
            if (len == slen) return str;
            str_make_independent_expand(str, slen, len - slen, termlen);
        }
        else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
                 (capa - len) > (len < 1024 ? len : 1024)) {
            /* Independent heap string: reallocate when the buffer is too
             * small, or when the unused slack exceeds min(len, 1024). */
            SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
                            (size_t)len + termlen, STR_HEAP_SIZE(str));
            RSTRING(str)->as.heap.aux.capa = len;
        }
        else if (len == slen) return str;
        /* Common tail for all heap paths that did not return early. */
        RSTRING(str)->as.heap.len = len;
        TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
    }
    return str;
}
3093
3094 static VALUE
3095 str_buf_cat(VALUE str, const char *ptr, long len)
3096 {
3097     long capa, total, olen, off = -1;
3098     char *sptr;
3099     const int termlen = TERM_LEN(str);
3100 #if !USE_RVARGC
3101     assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3102 #endif
3103
3104     RSTRING_GETMEM(str, sptr, olen);
3105     if (ptr >= sptr && ptr <= sptr + olen) {
3106         off = ptr - sptr;
3107     }
3108     rb_str_modify(str);
3109     if (len == 0) return 0;
3110     if (STR_EMBED_P(str)) {
3111         capa = str_embed_capa(str) - termlen;
3112         sptr = RSTRING(str)->as.embed.ary;
3113         olen = RSTRING_EMBED_LEN(str);
3114     }
3115     else {
3116         capa = RSTRING(str)->as.heap.aux.capa;
3117         sptr = RSTRING(str)->as.heap.ptr;
3118         olen = RSTRING(str)->as.heap.len;
3119     }
3120     if (olen > LONG_MAX - len) {
3121         rb_raise(rb_eArgError, "string sizes too big");
3122     }
3123     total = olen + len;
3124     if (capa < total) {
3125         if (total >= LONG_MAX / 2) {
3126             capa = total;
3127         }
3128         while (total > capa) {
3129             capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3130         }
3131         RESIZE_CAPA_TERM(str, capa, termlen);
3132         sptr = RSTRING_PTR(str);
3133     }
3134     if (off != -1) {
3135         ptr = sptr + off;
3136     }
3137     memcpy(sptr + olen, ptr, len);
3138     STR_SET_LEN(str, total);
3139     TERM_FILL(sptr + total, termlen); /* sentinel */
3140
3141     return str;
3142 }
3143
/* Appends the NUL-terminated C string ptr to str via str_buf_cat.
 * Note: ptr appears twice in the expansion, so it is evaluated twice. */
#define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
3145
3146 VALUE
3147 rb_str_cat(VALUE str, const char *ptr, long len)
3148 {
3149     if (len == 0) return str;
3150     if (len < 0) {
3151         rb_raise(rb_eArgError, "negative string size (or size too big)");
3152     }
3153     return str_buf_cat(str, ptr, len);
3154 }
3155
3156 VALUE
3157 rb_str_cat_cstr(VALUE str, const char *ptr)
3158 {
3159     must_not_null(ptr);
3160     return rb_str_buf_cat(str, ptr, strlen(ptr));
3161 }
3162
/*
 * Additional entry points declared through RUBY_ALIAS_FUNCTION; per the macro
 * arguments each one forwards to the named target with the listed arguments:
 *   rb_str_buf_cat  -> rb_str_cat
 *   rb_str_buf_cat2 -> rb_str_cat_cstr
 *   rb_str_cat2     -> rb_str_cat_cstr
 * NOTE(review): whether these become real symbol aliases or wrapper
 * functions depends on RUBY_ALIAS_FUNCTION, which is defined elsewhere.
 */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3166
/*
 * Appends +len+ bytes at +ptr+ to +str+, reconciling encodings and code
 * ranges, and returns +str+.
 *
 *   ptr_encindex  encoding index of the bytes being appended
 *   ptr_cr        their code range, or ENC_CODERANGE_UNKNOWN
 *   ptr_cr_ret    if non-NULL, receives the code range of the appended bytes
 *                 as known after any scan performed here
 *
 * Raises Encoding::CompatibilityError (rb_eEncCompatError) when the two
 * encodings cannot be combined, and ArgumentError when +len+ is negative.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;
    rb_encoding *str_enc, *ptr_enc;

    /* An empty receiver is treated as 7-bit regardless of its flags. */
    str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;

    if (str_encindex == ptr_encindex) {
        /* Same encoding: scan the new bytes only when the receiver's range
         * is known, since otherwise the result stays unknown anyway. */
        if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
        str_enc = rb_enc_from_index(str_encindex);
        ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* At least one side is not ASCII-compatible: only the trivial
             * cases (nothing to append, or an empty receiver that simply
             * adopts the other encoding) are allowed. */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
                return str;
            }
            goto incompatible;
        }
        if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, ptr_enc);
        }
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            /* Compute the receiver's range only when it can affect the outcome. */
            if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;

    /* Different encodings can be mixed only if at least one side is 7-bit. */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
        str_enc = rb_enc_from_index(str_encindex);
        ptr_enc = rb_enc_from_index(ptr_encindex);
        goto incompatible;
    }

    /* Decide the encoding index and code range of the result. */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            res_encindex = str_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            /* 7-bit receiver takes on the appended data's encoding and range. */
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
        if (ENC_CODERANGE_CLEAN_P(ptr_cr))
            res_cr = str_cr;
        else
            res_cr = ptr_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        /* Appending anything to a broken string makes the range unknown again. */
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    str_buf_cat(str, ptr, len);
    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;

  incompatible:
    rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
             rb_enc_name(str_enc), rb_enc_name(ptr_enc));
    UNREACHABLE_RETURN(Qundef);
}
3256
3257 VALUE
3258 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3259 {
3260     return rb_enc_cr_str_buf_cat(str, ptr, len,
3261         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3262 }
3263
3264 VALUE
3265 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3266 {
3267     /* ptr must reference NUL terminated ASCII string. */
3268     int encindex = ENCODING_GET(str);
3269     rb_encoding *enc = rb_enc_from_index(encindex);
3270     if (rb_enc_asciicompat(enc)) {
3271         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3272             encindex, ENC_CODERANGE_7BIT, 0);
3273     }
3274     else {
3275         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3276         while (*ptr) {
3277             unsigned int c = (unsigned char)*ptr;
3278             int len = rb_enc_codelen(c, enc);
3279             rb_enc_mbcput(c, buf, enc);
3280             rb_enc_cr_str_buf_cat(str, buf, len,
3281                 encindex, ENC_CODERANGE_VALID, 0);
3282             ptr++;
3283         }
3284         return str;
3285     }
3286 }
3287
3288 VALUE
3289 rb_str_buf_append(VALUE str, VALUE str2)
3290 {
3291     int str2_cr;
3292
3293     str2_cr = ENC_CODERANGE(str2);
3294
3295     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3296         ENCODING_GET(str2), str2_cr, &str2_cr);
3297
3298     ENC_CODERANGE_SET(str2, str2_cr);
3299
3300     return str;
3301 }
3302
3303 VALUE
3304 rb_str_append(VALUE str, VALUE str2)
3305 {
3306     StringValue(str2);
3307     return rb_str_buf_append(str, str2);
3308 }
3309
3310 #define MIN_PRE_ALLOC_SIZE 48
3311
3312 MJIT_FUNC_EXPORTED VALUE
3313 rb_str_concat_literals(size_t num, const VALUE *strary)
3314 {
3315     VALUE str;
3316     size_t i, s;
3317     long len = 1;
3318
3319     if (UNLIKELY(!num)) return rb_str_new(0, 0);
3320     if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3321
3322     for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3323     if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3324         str = rb_str_resurrect(strary[0]);
3325         s = 1;
3326     }
3327     else {
3328         str = rb_str_buf_new(len);
3329         rb_enc_copy(str, strary[0]);
3330         s = 0;
3331     }
3332
3333     for (i = s; i < num; ++i) {
3334         const VALUE v = strary[i];
3335         int encidx = ENCODING_GET(v);
3336
3337         rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
3338                               encidx, ENC_CODERANGE(v), NULL);
3339         if (encidx != ENCINDEX_US_ASCII) {
3340             if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3341                 rb_enc_set_index(str, encidx);
3342         }
3343     }
3344     return str;
3345 }
3346
3347 /*
3348  *  call-seq:
3349  *     concat(*objects) -> string
3350  *
3351  *  Concatenates each object in +objects+ to +self+ and returns +self+:
3352  *
3353  *    s = 'foo'
3354  *    s.concat('bar', 'baz') # => "foobarbaz"
3355  *    s                      # => "foobarbaz"
3356  *
3357  *  For each given object +object+ that is an \Integer,
3358  *  the value is considered a codepoint and converted to a character before concatenation:
3359  *
3360  *    s = 'foo'
3361  *    s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3362  *
3363  *  Related: String#<<, which takes a single argument.
3364  */
3365 static VALUE
3366 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3367 {
3368     str_modifiable(str);
3369
3370     if (argc == 1) {
3371         return rb_str_concat(str, argv[0]);
3372     }
3373     else if (argc > 1) {
3374         int i;
3375         VALUE arg_str = rb_str_tmp_new(0);
3376         rb_enc_copy(arg_str, str);
3377         for (i = 0; i < argc; i++) {
3378             rb_str_concat(arg_str, argv[i]);
3379         }
3380         rb_str_buf_append(str, arg_str);
3381     }
3382
3383     return str;
3384 }
3385
3386 /*
3387  *  call-seq:
3388  *    string << object -> string
3389  *
3390  *  Concatenates +object+ to +self+ and returns +self+:
3391  *
3392  *    s = 'foo'
3393  *    s << 'bar' # => "foobar"
3394  *    s          # => "foobar"
3395  *
3396  *  If +object+ is an \Integer,
3397  *  the value is considered a codepoint and converted to a character before concatenation:
3398  *
3399  *    s = 'foo'
3400  *    s << 33 # => "foo!"
3401  *
3402  *  Related: String#concat, which takes multiple arguments.
3403  */
VALUE
rb_str_concat(VALUE str1, VALUE str2)
{
    unsigned int code;
    rb_encoding *enc = STR_ENC_GET(str1);
    int encidx;

    if (RB_INTEGER_TYPE_P(str2)) {
        /* Integer argument: it is a codepoint. A zero return from
         * rb_num_to_uint means `code` was filled in; any other result is
         * reported as a RangeError below. */
        if (rb_num_to_uint(str2, &code) == 0) {
        }
        else if (FIXNUM_P(str2)) {
            rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
        }
        else {
            rb_raise(rb_eRangeError, "bignum out of char range");
        }
    }
    else {
        /* Anything else is appended as a string. */
        return rb_str_append(str1, str2);
    }

    encidx = rb_enc_to_index(enc);
    if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
        /* US-ASCII automatically extended to ASCII-8BIT */
        char buf[1];
        buf[0] = (char)code;
        /* Single-byte encodings accept codepoints 0..0xFF only. */
        if (code > 0xFF) {
            rb_raise(rb_eRangeError, "%u out of char range", code);
        }
        rb_str_cat(str1, buf, 1);
        if (encidx == ENCINDEX_US_ASCII && code > 127) {
            rb_enc_associate_index(str1, ENCINDEX_ASCII);
            ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
        }
    }
    else {
        long pos = RSTRING_LEN(str1);
        /* Captured before rb_str_resize, which clears the code range. */
        int cr = ENC_CODERANGE(str1);
        int len;
        char *buf;

        /* rb_enc_codelen yields the encoded byte length, or one of the
         * error values handled by the cases below. */
        switch (len = rb_enc_codelen(code, enc)) {
          case ONIGERR_INVALID_CODE_POINT_VALUE:
            rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
            break;
          case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
          case 0:
            rb_raise(rb_eRangeError, "%u out of char range", code);
            break;
        }
        buf = ALLOCA_N(char, len + 1);
        rb_enc_mbcput(code, buf, enc);
        /* Verify that the bytes just written form exactly one character of
         * the expected length in this encoding. */
        if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
            rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
        }
        rb_str_resize(str1, pos+len);
        memcpy(RSTRING_PTR(str1) + pos, buf, len);
        /* Restore the saved code range, upgrading 7-bit to valid when a
         * non-ASCII codepoint was appended. */
        if (cr == ENC_CODERANGE_7BIT && code > 127)
            cr = ENC_CODERANGE_VALID;
        ENC_CODERANGE_SET(str1, cr);
    }
    return str1;
}
3467
3468 /*
3469  *  call-seq:
3470  *    prepend(*other_strings)  -> string
3471  *
3472  *  Prepends each string in +other_strings+ to +self+ and returns +self+:
3473  *
3474  *    s = 'foo'
3475  *    s.prepend('bar', 'baz') # => "barbazfoo"
3476  *    s                       # => "barbazfoo"
3477  *
3478  *  Related: String#concat.
3479  */
3480
3481 static VALUE
3482 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3483 {
3484     str_modifiable(str);
3485
3486     if (argc == 1) {
3487         rb_str_update(str, 0L, 0L, argv[0]);
3488     }
3489     else if (argc > 1) {
3490         int i;
3491         VALUE arg_str = rb_str_tmp_new(0);
3492         rb_enc_copy(arg_str, str);
3493         for (i = 0; i < argc; i++) {
3494             rb_str_append(arg_str, argv[i]);
3495         }
3496         rb_str_update(str, 0L, 0L, arg_str);
3497     }
3498
3499     return str;
3500 }
3501
3502 st_index_t
3503 rb_str_hash(VALUE str)
3504 {
3505     int e = ENCODING_GET(str);
3506     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
3507         e = 0;
3508     }
3509     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3510 }
3511
3512 int
3513 rb_str_hash_cmp(VALUE str1, VALUE str2)
3514 {
3515     long len1, len2;
3516     const char *ptr1, *ptr2;
3517     RSTRING_GETMEM(str1, ptr1, len1);
3518     RSTRING_GETMEM(str2, ptr2, len2);
3519     return (len1 != len2 ||
3520             !rb_str_comparable(str1, str2) ||
3521             memcmp(ptr1, ptr2, len1) != 0);
3522 }
3523
3524 /*
3525  * call-seq:
3526  *   hash -> integer
3527  *
3528  * Returns the integer hash value for +self+.
3529  * The value is based on the length, content and encoding of +self+.
3530  *
3531  * Related: Object#hash.
3532  */
3533
3534 static VALUE
3535 rb_str_hash_m(VALUE str)
3536 {
3537     st_index_t hval = rb_str_hash(str);
3538     return ST2FIX(hval);
3539 }
3540
/* Evaluates to b when a is greater than b, otherwise to a.
 * Each argument may be evaluated twice. */
#define lesser(a,b) (((b)<(a))?(b):(a))
3542
3543 int
3544 rb_str_comparable(VALUE str1, VALUE str2)
3545 {
3546     int idx1, idx2;
3547     int rc1, rc2;
3548
3549     if (RSTRING_LEN(str1) == 0) return TRUE;
3550     if (RSTRING_LEN(str2) == 0) return TRUE;
3551     idx1 = ENCODING_GET(str1);
3552     idx2 = ENCODING_GET(str2);
3553     if (idx1 == idx2) return TRUE;
3554     rc1 = rb_enc_str_coderange(str1);
3555     rc2 = rb_enc_str_coderange(str2);
3556     if (rc1 == ENC_CODERANGE_7BIT) {
3557         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3558         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3559             return TRUE;
3560     }
3561     if (rc2 == ENC_CODERANGE_7BIT) {
3562         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3563             return TRUE;
3564     }
3565     return FALSE;
3566 }
3567
3568 int
3569 rb_str_cmp(VALUE str1, VALUE str2)
3570 {
3571     long len1, len2;
3572     const char *ptr1, *ptr2;
3573     int retval;
3574
3575     if (str1 == str2) return 0;
3576     RSTRING_GETMEM(str1, ptr1, len1);
3577     RSTRING_GETMEM(str2, ptr2, len2);
3578     if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3579         if (len1 == len2) {
3580             if (!rb_str_comparable(str1, str2)) {
3581                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3582                     return 1;
3583                 return -1;
3584             }
3585             return 0;
3586         }
3587         if (len1 > len2) return 1;
3588         return -1;
3589     }
3590     if (retval > 0) return 1;
3591     return -1;
3592 }
3593
3594 /*
3595  *  call-seq:
3596  *    string == object -> true or false
3597  *    string === object -> true or false
3598  *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
3601  *
3602  *    s = 'foo'
3603  *    s == 'foo' # => true
3604  *    s == 'food' # => false
3605  *    s == 'FOO' # => false
3606  *
 *  Returns +false+ if the two strings' encodings are not compatible:
 *
 *    "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3609  *
3610  *  If +object+ is not an instance of \String but responds to +to_str+, then the
3611  *  two strings are compared using <code>object.==</code>.
3612  */
3613
3614 VALUE
3615 rb_str_equal(VALUE str1, VALUE str2)
3616 {
3617     if (str1 == str2) return Qtrue;
3618     if (!RB_TYPE_P(str2, T_STRING)) {
3619         if (!rb_respond_to(str2, idTo_str)) {
3620             return Qfalse;
3621         }
3622         return rb_equal(str2, str1);
3623     }
3624     return rb_str_eql_internal(str1, str2);
3625 }
3626
3627 /*
3628  * call-seq:
3629  *   eql?(object) -> true or false
3630  *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
3633  *
3634  *    s = 'foo'
3635  *    s.eql?('foo') # => true
3636  *    s.eql?('food') # => false
3637  *    s.eql?('FOO') # => false
3638  *
3639  *  Returns +false+ if the two strings' encodings are not compatible:
3640  *
3641  *    "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3642  *
3643  */
3644
3645 MJIT_FUNC_EXPORTED VALUE
3646 rb_str_eql(VALUE str1, VALUE str2)
3647 {
3648     if (str1 == str2) return Qtrue;
3649     if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3650     return rb_str_eql_internal(str1, str2);
3651 }
3652
3653 /*
3654  *  call-seq:
3655  *    string <=> other_string -> -1, 0, 1, or nil
3656  *
3657  *  Compares +self+ and +other_string+, returning:
3658  *
3659  *  - -1 if +other_string+ is larger.
3660  *  - 0 if the two are equal.
3661  *  - 1 if +other_string+ is smaller.
3662  *  - +nil+ if the two are incomparable.
3663  *
3664  *  Examples:
3665  *
3666  *    'foo' <=> 'foo' # => 0
3667  *    'foo' <=> 'food' # => -1
3668  *    'food' <=> 'foo' # => 1
3669  *    'FOO' <=> 'foo' # => -1
3670  *    'foo' <=> 'FOO' # => 1
3671  *    'foo' <=> 1 # => nil
3672  *
3673  */
3674
3675 static VALUE
3676 rb_str_cmp_m(VALUE str1, VALUE str2)
3677 {
3678     int result;
3679     VALUE s = rb_check_string_type(str2);
3680     if (NIL_P(s)) {
3681         return rb_invcmp(str1, str2);
3682     }
3683     result = rb_str_cmp(str1, s);
3684     return INT2FIX(result);
3685 }
3686
/* Forward declarations: both helpers are defined further down in this file
 * and are used by rb_str_casecmp / rb_str_casecmp_p before that point. */
static VALUE str_casecmp(VALUE str1, VALUE str2);
static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3689
3690 /*
3691  *  call-seq:
3692  *    casecmp(other_string) -> -1, 0, 1, or nil
3693  *
3694  *  Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3695  *
3696  *  - -1 if <tt>other_string.downcase</tt> is larger.
3697  *  - 0 if the two are equal.
3698  *  - 1 if <tt>other_string.downcase</tt> is smaller.
3699  *  - +nil+ if the two are incomparable.
3700  *
3701  *  Examples:
3702  *
3703  *    'foo'.casecmp('foo') # => 0
3704  *    'foo'.casecmp('food') # => -1
3705  *    'food'.casecmp('foo') # => 1
3706  *    'FOO'.casecmp('foo') # => 0
3707  *    'foo'.casecmp('FOO') # => 0
3708  *    'foo'.casecmp(1) # => nil
3709  *
3710  *  See {Case Mapping}[doc/case_mapping_rdoc.html].
3711  *
3712  *  Related: String#casecmp?.
3713  *
3714  */
3715
3716 static VALUE
3717 rb_str_casecmp(VALUE str1, VALUE str2)
3718 {
3719     VALUE s = rb_check_string_type(str2);
3720     if (NIL_P(s)) {
3721         return Qnil;
3722     }
3723     return str_casecmp(str1, s);
3724 }
3725
/*
 * Case-insensitive three-way comparison of two strings.
 * Returns INT2FIX(-1), INT2FIX(0) or INT2FIX(1), or Qnil when
 * rb_enc_compatible reports the encodings as incompatible.
 * Only characters handled by TOLOWER are case-folded; other characters
 * are compared by their raw bytes.
 */
static VALUE
str_casecmp(VALUE str1, VALUE str2)
{
    long len;
    rb_encoding *enc;
    const char *p1, *p1end, *p2, *p2end;

    enc = rb_enc_compatible(str1, str2);
    if (!enc) {
        return Qnil;
    }

    p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
    p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
    if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
        /* Fast path: walk both strings byte by byte, lowercasing only
         * when the raw bytes differ. */
        while (p1 < p1end && p2 < p2end) {
            if (*p1 != *p2) {
                unsigned int c1 = TOLOWER(*p1 & 0xff);
                unsigned int c2 = TOLOWER(*p2 & 0xff);
                if (c1 != c2)
                    return INT2FIX(c1 < c2 ? -1 : 1);
            }
            p1++;
            p2++;
        }
    }
    else {
        /* General path: advance one character at a time. */
        while (p1 < p1end && p2 < p2end) {
            int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
            int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);

            if (0 <= c1 && 0 <= c2) {
                /* Both characters were returned as non-negative codes by
                 * rb_enc_ascget: compare them case-insensitively. */
                c1 = TOLOWER(c1);
                c2 = TOLOWER(c2);
                if (c1 != c2)
                    return INT2FIX(c1 < c2 ? -1 : 1);
            }
            else {
                /* Otherwise compare the raw bytes of the two characters,
                 * then their byte lengths. */
                int r;
                l1 = rb_enc_mbclen(p1, p1end, enc);
                l2 = rb_enc_mbclen(p2, p2end, enc);
                len = l1 < l2 ? l1 : l2;
                r = memcmp(p1, p2, len);
                if (r != 0)
                    return INT2FIX(r < 0 ? -1 : 1);
                if (l1 != l2)
                    return INT2FIX(l1 < l2 ? -1 : 1);
            }
            p1 += l1;
            p2 += l2;
        }
    }
    /* One string is a (case-insensitive) prefix of the other, or they are
     * equal: decide by byte length. */
    if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
    if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
    return INT2FIX(-1);
}
3782
3783 /*
3784  *  call-seq:
3785  *    casecmp?(other_string) -> true, false, or nil
3786  *
3787  *  Returns +true+ if +self+ and +other_string+ are equal after
3788  *  Unicode case folding, otherwise +false+:
3789  *
3790  *    'foo'.casecmp?('foo') # => true
3791  *    'foo'.casecmp?('food') # => false
3792  *    'food'.casecmp?('foo') # => false
3793  *    'FOO'.casecmp?('foo') # => true
3794  *    'foo'.casecmp?('FOO') # => true
3795  *
3796  *  Returns +nil+ if the two values are incomparable:
3797  *
3798  *    'foo'.casecmp?(1) # => nil
3799  *
3800  *  See {Case Mapping}[doc/case_mapping_rdoc.html].
3801  *
3802  *  Related: String#casecmp.
3803  *
3804  */
3805
3806 static VALUE
3807 rb_str_casecmp_p(VALUE str1, VALUE str2)
3808 {
3809     VALUE s = rb_check_string_type(str2);
3810     if (NIL_P(s)) {
3811         return Qnil;
3812     }
3813     return str_casecmp_p(str1, s);
3814 }
3815
3816 static VALUE
3817 str_casecmp_p(VALUE str1, VALUE str2)
3818 {
3819     rb_encoding *enc;
3820     VALUE folded_str1, folded_str2;
3821     VALUE fold_opt = sym_fold;
3822
3823     enc = rb_enc_compatible(str1, str2);
3824     if (!enc) {
3825         return Qnil;
3826     }
3827
3828     folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3829     folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3830
3831     return rb_str_eql(folded_str1, folded_str2);
3832 }
3833
/*
 * Searches for the byte sequence sub_ptr[0, sub_len) starting at str_ptr,
 * accepting only matches that begin on a character boundary.
 *
 *   str_ptr      where the search starts
 *   str_ptr_end  end of the whole string
 *   str_len      used together with offset to compute the number of bytes
 *                to search (str_len - offset)
 *   offset       added to the position of the match in the return value
 *
 * Returns the match position plus offset, or a negative value when nothing
 * is found.
 */
static long
strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
            const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
{
    const char *search_start = str_ptr;
    long pos, search_len = str_len - offset;

    for (;;) {
        const char *t;
        pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
        if (pos < 0) return pos;
        /* t is the character head at or after the hit; if it equals the hit,
         * the match starts on a character boundary and is accepted. */
        t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
        if (t == search_start + pos) break;
        /* The hit was inside a character: resume from the next character head. */
        search_len -= t - search_start;
        if (search_len <= 0) return -1;
        offset += t - search_start;
        search_start = t;
    }
    return pos + offset;
}
3854
/* rb_str_index: rb_strseq_index with in_byte = 0. */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3856
/*
 * Finds +sub+ in +str+ starting at +offset+ and returns the position found
 * (as computed by strseq_core), or -1 when there is no match.
 *
 * When +in_byte+ is zero, +offset+ is counted in characters and converted
 * with str_offset before searching; when non-zero, lengths and +offset+ are
 * taken as bytes. A negative +offset+ counts from the end of the string.
 * rb_enc_check raises if the encodings are incompatible; a broken +sub+
 * never matches.
 */
static long
rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
{
    const char *str_ptr, *str_ptr_end, *sub_ptr;
    long str_len, sub_len;
    rb_encoding *enc;

    enc = rb_enc_check(str, sub);
    if (is_broken_string(sub)) return -1;

    str_ptr = RSTRING_PTR(str);
    str_ptr_end = RSTRING_END(str);
    str_len = RSTRING_LEN(str);
    sub_ptr = RSTRING_PTR(sub);
    sub_len = RSTRING_LEN(sub);

    /* A substring with more bytes than the string can never match. */
    if (str_len < sub_len) return -1;

    if (offset != 0) {
        long str_len_char, sub_len_char;
        int single_byte = single_byte_optimizable(str);
        /* Lengths in the unit that offset is expressed in. */
        str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
        sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
        if (offset < 0) {
            offset += str_len_char;
            if (offset < 0) return -1;
        }
        /* Not enough room left after offset for sub to fit. */
        if (str_len_char - offset < sub_len_char) return -1;
        if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
        str_ptr += offset;
    }
    /* An empty substring matches immediately at the starting offset. */
    if (sub_len == 0) return offset;

    /* strseq_core rejects hits that do not start on a character boundary */
    return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
}
3893
3894
3895 /*
3896  *  call-seq:
3897  *    index(substring, offset = 0) -> integer or nil
3898  *    index(regexp, offset = 0) -> integer or nil
3899  *
3900  *  Returns the \Integer index of the first occurrence of the given +substring+,
3901  *  or +nil+ if none found:
3902  *
3903  *    'foo'.index('f') # => 0
3904  *    'foo'.index('o') # => 1
3905  *    'foo'.index('oo') # => 1
3906  *    'foo'.index('ooo') # => nil
3907  *
3908  *  Returns the \Integer index of the first match for the given \Regexp +regexp+,
3909  *  or +nil+ if none found:
3910  *
3911  *    'foo'.index(/f/) # => 0
3912  *    'foo'.index(/o/) # => 1
3913  *    'foo'.index(/oo/) # => 1
3914  *    'foo'.index(/ooo/) # => nil
3915  *
3916  *  \Integer argument +offset+, if given, specifies the position in the
3917  *  string to begin the search:
3918  *
3919  *    'foo'.index('o', 1) # => 1
3920  *    'foo'.index('o', 2) # => 2
3921  *    'foo'.index('o', 3) # => nil
3922  *
3923  *  If +offset+ is negative, counts backward from the end of +self+:
3924  *
3925  *    'foo'.index('o', -1) # => 2
3926  *    'foo'.index('o', -2) # => 1
3927  *    'foo'.index('o', -3) # => 1
3928  *    'foo'.index('o', -4) # => nil
3929  *
3930  *  Related: String#rindex.
3931  */
3932
3933 static VALUE
3934 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3935 {
3936     VALUE sub;
3937     VALUE initpos;
3938     long pos;
3939
3940     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3941         pos = NUM2LONG(initpos);
3942     }
3943     else {
3944         pos = 0;
3945     }
3946     if (pos < 0) {
3947         pos += str_strlen(str, NULL);
3948         if (pos < 0) {
3949             if (RB_TYPE_P(sub, T_REGEXP)) {
3950                 rb_backref_set(Qnil);
3951             }
3952             return Qnil;
3953         }
3954     }
3955
3956     if (RB_TYPE_P(sub, T_REGEXP)) {
3957         if (pos > str_strlen(str, NULL))
3958             return Qnil;
3959         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3960                          rb_enc_check(str, sub), single_byte_optimizable(str));
3961
3962         if (rb_reg_search(sub, str, pos, 0) < 0) {
3963             return Qnil;
3964         }
3965         else {
3966             VALUE match = rb_backref_get();
3967             struct re_registers *regs = RMATCH_REGS(match);
3968             pos = rb_str_sublen(str, BEG(0));
3969             return LONG2NUM(pos);
3970         }
3971     }
3972     else {
3973         StringValue(sub);
3974         pos = rb_str_index(str, sub, pos);
3975         pos = rb_str_sublen(str, pos);
3976     }
3977
3978     if (pos == -1) return Qnil;
3979     return LONG2NUM(pos);
3980 }
3981
/*
 * Backward search helper used by rb_str_rindex. Two implementations are
 * provided, selected by HAVE_MEMRCHR.
 *
 *   s    pointer into str at which the backward search starts
 *   pos  the position corresponding to s, as passed by the caller
 *
 * Returns the position of the match, or -1 when there is none.
 */
#ifdef HAVE_MEMRCHR
static long
str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
{
    char *hit, *adjusted;
    int c;
    long slen, searchlen;
    char *sbeg, *e, *t;

    slen = RSTRING_LEN(sub);
    /* An empty substring matches right at the starting position. */
    if (slen == 0) return pos;
    sbeg = RSTRING_PTR(str);
    e = RSTRING_END(str);
    t = RSTRING_PTR(sub);
    /* First byte of sub; candidates are located by scanning for it. */
    c = *t & 0xff;
    searchlen = s - sbeg + 1;

    do {
        /* Last occurrence of the first byte within the remaining range. */
        hit = memrchr(sbeg, c, searchlen);
        if (!hit) break;
        adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
        if (hit != adjusted) {
            /* The byte is in the middle of a character: skip back to that
             * character's head (continue re-tests the loop condition). */
            searchlen = adjusted - sbeg;
            continue;
        }
        if (memcmp(hit, t, slen) == 0)
            return rb_str_sublen(str, hit - sbeg);
        searchlen = adjusted - sbeg;
    } while (searchlen > 0);

    return -1;
}
#else
static long
str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
{
    long slen;
    char *sbeg, *e, *t;

    sbeg = RSTRING_PTR(str);
    e = RSTRING_END(str);
    t = RSTRING_PTR(sub);
    slen = RSTRING_LEN(sub);

    /* Step back one character at a time, comparing at each position. */
    while (s) {
        if (memcmp(s, t, slen) == 0) {
            return pos;
        }
        if (pos == 0) break;
        pos--;
        s = rb_enc_prev_char(sbeg, s, e, enc);
    }

    return -1;
}
#endif
4038
/*
 * Finds the last occurrence of +sub+ in +str+ that starts at or before
 * position +pos+. Returns the position found, or -1 when there is none.
 * rb_enc_check raises if the encodings are incompatible; a broken +sub+
 * never matches.
 */
static long
rb_str_rindex(VALUE str, VALUE sub, long pos)
{
    long len, slen;
    char *sbeg, *s;
    rb_encoding *enc;
    int singlebyte;

    enc = rb_enc_check(str, sub);
    if (is_broken_string(sub)) return -1;
    singlebyte = single_byte_optimizable(str);
    len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
    slen = str_strlen(sub, enc); /* rb_enc_check */

    /* substring longer than string */
    if (len < slen) return -1;
    /* Clamp pos to the last position at which sub could still fit. */
    if (len - pos < slen) pos = len - slen;
    if (len == 0) return pos;

    sbeg = RSTRING_PTR(str);

    /* Only the very beginning is left to check. */
    if (pos == 0) {
        if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
            return 0;
        else
            return -1;
    }

    /* Locate the pointer for position pos and search backward from it. */
    s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
    return str_rindex(str, sub, s, pos, enc);
}
4070
4071 /*
4072  *  call-seq:
4073  *    rindex(substring, offset = self.length) -> integer or nil
4074  *    rindex(regexp, offset = self.length) -> integer or nil
4075  *
4076  *  Returns the \Integer index of the _last_ occurrence of the given +substring+,
4077  *  or +nil+ if none found:
4078  *
4079  *    'foo'.rindex('f') # => 0
4080  *    'foo'.rindex('o') # => 2
4081  *    'foo'.rindex('oo') # => 1
4082  *    'foo'.rindex('ooo') # => nil
4083  *
4084  *  Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4085  *  or +nil+ if none found:
4086  *
4087  *    'foo'.rindex(/f/) # => 0
4088  *    'foo'.rindex(/o/) # => 2
4089  *    'foo'.rindex(/oo/) # => 1
4090  *    'foo'.rindex(/ooo/) # => nil
4091  *
4092  *  The _last_ match means starting at the possible last position, not
4093  *  the last of longest matches.
4094  *
4095  *    'foo'.rindex(/o+/) # => 2
4096  *    $~ #=> #<MatchData "o">
4097  *
 *  To get the last longest match, combine the pattern with a negative
 *  lookbehind.
4100  *
4101  *    'foo'.rindex(/(?<!o)o+/) # => 1
4102  *    $~ #=> #<MatchData "oo">
4103  *
 *  Or use String#index with a negative lookahead.
4105  *
4106  *    'foo'.index(/o+(?!.*o)/) # => 1
4107  *    $~ #=> #<MatchData "oo">
4108  *
4109  *  \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4110  *   string to _end_ the search:
4111  *
4112  *    'foo'.rindex('o', 0) # => nil
4113  *    'foo'.rindex('o', 1) # => 1
4114  *    'foo'.rindex('o', 2) # => 2
4115  *    'foo'.rindex('o', 3) # => 2
4116  *
4117  *  If +offset+ is a negative \Integer, the maximum starting position in the
4118  *  string to _end_ the search is the sum of the string's length and +offset+:
4119  *
4120  *    'foo'.rindex('o', -1) # => 2
4121  *    'foo'.rindex('o', -2) # => 1
4122  *    'foo'.rindex('o', -3) # => nil
4123  *    'foo'.rindex('o', -4) # => nil
4124  *
4125  *  Related: String#index.
4126  */
4127
4128 static VALUE
4129 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4130 {
4131     VALUE sub;
4132     VALUE vpos;
4133     rb_encoding *enc = STR_ENC_GET(str);
4134     long pos, len = str_strlen(str, enc); /* str's enc */
4135
4136     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4137         pos = NUM2LONG(vpos);
4138         if (pos < 0) {
4139             pos += len;
4140             if (pos < 0) {
4141                 if (RB_TYPE_P(sub, T_REGEXP)) {
4142                     rb_backref_set(Qnil);
4143                 }
4144                 return Qnil;
4145             }
4146         }
4147         if (pos > len) pos = len;
4148     }
4149     else {
4150         pos = len;
4151     }
4152
4153     if (RB_TYPE_P(sub, T_REGEXP)) {
4154         /* enc = rb_get_check(str, sub); */
4155         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4156                          enc, single_byte_optimizable(str));
4157
4158         if (rb_reg_search(sub, str, pos, 1) >= 0) {
4159             VALUE match = rb_backref_get();
4160             struct re_registers *regs = RMATCH_REGS(match);
4161             pos = rb_str_sublen(str, BEG(0));
4162             return LONG2NUM(pos);
4163         }
4164     }
4165     else {
4166         StringValue(sub);
4167         pos = rb_str_rindex(str, sub, pos);
4168         if (pos >= 0) return LONG2NUM(pos);
4169     }
4170     return Qnil;
4171 }
4172
4173 /*
4174  *  call-seq:
4175  *    string =~ regexp -> integer or nil
4176  *    string =~ object -> integer or nil
4177  *
4178  *  Returns the \Integer index of the first substring that matches
4179  *  the given +regexp+, or +nil+ if no match found:
4180  *
4181  *    'foo' =~ /f/ # => 0
4182  *    'foo' =~ /o/ # => 1
4183  *    'foo' =~ /x/ # => nil
4184  *
4185  *  Note: also updates
4186  *  {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4187  *
4188  *  If the given +object+ is not a \Regexp, returns the value
4189  *  returned by <tt>object =~ self</tt>.
4190  *
4191  *  Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4192  *  (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
4193  *
4194  *    number= nil
4195  *    "no. 9" =~ /(?<number>\d+)/
4196  *    number # => nil (not assigned)
4197  *    /(?<number>\d+)/ =~ "no. 9"
4198  *    number #=> "9"
4199  *
4200  */
4201
4202 static VALUE
4203 rb_str_match(VALUE x, VALUE y)
4204 {
4205     switch (OBJ_BUILTIN_TYPE(y)) {
4206       case T_STRING:
4207         rb_raise(rb_eTypeError, "type mismatch: String given");
4208
4209       case T_REGEXP:
4210         return rb_reg_match(y, x);
4211
4212       default:
4213         return rb_funcall(y, idEqTilde, 1, x);
4214     }
4215 }
4216
4217
4218 static VALUE get_pat(VALUE);
4219
4220
4221 /*
4222  *  call-seq:
4223  *    match(pattern, offset = 0) -> matchdata or nil
4224  *    match(pattern, offset = 0) {|matchdata| ... } -> object
4225  *
 *  Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4227  *
4228  *  Note: also updates
4229  *  {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4230  *
4231  *  - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4232  *      regexp = Regexp.new(pattern)
4233  *  - Computes +matchdata+, which will be either a \MatchData object or +nil+
4234  *    (see Regexp#match):
 *      matchdata = regexp.match(self)
4236  *
4237  *  With no block given, returns the computed +matchdata+:
4238  *
4239  *    'foo'.match('f') # => #<MatchData "f">
4240  *    'foo'.match('o') # => #<MatchData "o">
4241  *    'foo'.match('x') # => nil
4242  *
4243  *  If \Integer argument +offset+ is given, the search begins at index +offset+:
4244  *
4245  *    'foo'.match('f', 1) # => nil
4246  *    'foo'.match('o', 1) # => #<MatchData "o">
4247  *
4248  *  With a block given, calls the block with the computed +matchdata+
4249  *  and returns the block's return value:
4250  *
4251  *    'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4252  *    'foo'.match(/x/) {|matchdata| matchdata } # => nil
4253  *    'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4254  *
4255  */
4256
4257 static VALUE
4258 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4259 {
4260     VALUE re, result;
4261     if (argc < 1)
4262         rb_check_arity(argc, 1, 2);
4263     re = argv[0];
4264     argv[0] = str;
4265     result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4266     if (!NIL_P(result) && rb_block_given_p()) {
4267         return rb_yield(result);
4268     }
4269     return result;
4270 }
4271
4272 /*
4273  *  call-seq:
4274  *    match?(pattern, offset = 0) -> true or false
4275  *
4276  *  Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4277  *
4278  *  Note: does not update
4279  *  {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4280  *
4281  *  Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4282  *    regexp = Regexp.new(pattern)
4283  *
 *  Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4285  *  +false+ otherwise:
4286  *
4287  *    'foo'.match?(/o/) # => true
4288  *    'foo'.match?('o') # => true
4289  *    'foo'.match?(/x/) # => false
4290  *
4291  *  If \Integer argument +offset+ is given, the search begins at index +offset+:
4292  *    'foo'.match?('f', 1) # => false
4293  *    'foo'.match?('o', 1) # => true
4294  *
4295  */
4296
4297 static VALUE
4298 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4299 {
4300     VALUE re;
4301     rb_check_arity(argc, 1, 2);
4302     re = get_pat(argv[0]);
4303     return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4304 }
4305
/* Outcome of stepping one character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* the bytes could not be stepped as a character */
    NEIGHBOR_FOUND,         /* the neighboring character was written in place */
    NEIGHBOR_WRAPPED        /* stepping ran off the end; the caller handles the carry */
};
4311
4312 static enum neighbor_char
4313 enc_succ_char(char *p, long len, rb_encoding *enc)
4314 {
4315     long i;
4316     int l;
4317
4318     if (rb_enc_mbminlen(enc) > 1) {
4319         /* wchar, trivial case */
4320         int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4321         if (!MBCLEN_CHARFOUND_P(r)) {
4322             return NEIGHBOR_NOT_CHAR;
4323         }
4324         c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4325         l = rb_enc_code_to_mbclen(c, enc);
4326         if (!l) return NEIGHBOR_NOT_CHAR;
4327         if (l != len) return NEIGHBOR_WRAPPED;
4328         rb_enc_mbcput(c, p, enc);
4329         r = rb_enc_precise_mbclen(p, p + len, enc);
4330         if (!MBCLEN_CHARFOUND_P(r)) {
4331             return NEIGHBOR_NOT_CHAR;
4332         }
4333         return NEIGHBOR_FOUND;
4334     }
4335     while (1) {
4336         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4337             p[i] = '\0';
4338         if (i < 0)
4339             return NEIGHBOR_WRAPPED;
4340         ++((unsigned char*)p)[i];
4341         l = rb_enc_precise_mbclen(p, p+len, enc);
4342         if (MBCLEN_CHARFOUND_P(l)) {
4343             l = MBCLEN_CHARFOUND_LEN(l);
4344             if (l == len) {
4345                 return NEIGHBOR_FOUND;
4346             }
4347             else {
4348                 memset(p+l, 0xff, len-l);
4349             }
4350         }
4351         if (MBCLEN_INVALID_P(l) && i < len-1) {
4352             long len2;
4353             int l2;
4354             for (len2 = len-1; 0 < len2; len2--) {
4355                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4356                 if (!MBCLEN_INVALID_P(l2))
4357                     break;
4358             }
4359             memset(p+len2+1, 0xff, len-(len2+1));
4360         }
4361     }
4362 }
4363
4364 static enum neighbor_char
4365 enc_pred_char(char *p, long len, rb_encoding *enc)
4366 {
4367     long i;
4368     int l;
4369     if (rb_enc_mbminlen(enc) > 1) {
4370         /* wchar, trivial case */
4371         int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4372         if (!MBCLEN_CHARFOUND_P(r)) {
4373             return NEIGHBOR_NOT_CHAR;
4374         }
4375         c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4376         if (!c) return NEIGHBOR_NOT_CHAR;
4377         --c;
4378         l = rb_enc_code_to_mbclen(c, enc);
4379         if (!l) return NEIGHBOR_NOT_CHAR;
4380         if (l != len) return NEIGHBOR_WRAPPED;
4381         rb_enc_mbcput(c, p, enc);
4382         r = rb_enc_precise_mbclen(p, p + len, enc);
4383         if (!MBCLEN_CHARFOUND_P(r)) {
4384             return NEIGHBOR_NOT_CHAR;
4385         }
4386         return NEIGHBOR_FOUND;
4387     }
4388     while (1) {
4389         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4390             p[i] = '\xff';
4391         if (i < 0)
4392             return NEIGHBOR_WRAPPED;
4393         --((unsigned char*)p)[i];
4394         l = rb_enc_precise_mbclen(p, p+len, enc);
4395         if (MBCLEN_CHARFOUND_P(l)) {
4396             l = MBCLEN_CHARFOUND_LEN(l);
4397             if (l == len) {
4398                 return NEIGHBOR_FOUND;
4399             }
4400             else {
4401                 memset(p+l, 0, len-l);
4402             }
4403         }
4404         if (MBCLEN_INVALID_P(l) && i < len-1) {
4405             long len2;
4406             int l2;
4407             for (len2 = len-1; 0 < len2; len2--) {
4408                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4409                 if (!MBCLEN_INVALID_P(l2))
4410                     break;
4411             }
4412             memset(p+len2+1, 0, len-(len2+1));
4413         }
4414     }
4415 }
4416
4417 /*
4418   overwrite +p+ by succeeding letter in +enc+ and returns
4419   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4420   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4421   assuming each ranges are successive, and mbclen
4422   never change in each ranges.
4423   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4424   character.
4425  */
4426 static enum neighbor_char
4427 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4428 {
4429     enum neighbor_char ret;
4430     unsigned int c;
4431     int ctype;
4432     int range;
4433     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4434
4435     /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4436     int try;
4437     const int max_gaps = 1;
4438
4439     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4440     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4441         ctype = ONIGENC_CTYPE_DIGIT;
4442     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4443         ctype = ONIGENC_CTYPE_ALPHA;
4444     else
4445         return NEIGHBOR_NOT_CHAR;
4446
4447     MEMCPY(save, p, char, len);
4448     for (try = 0; try <= max_gaps; ++try) {
4449         ret = enc_succ_char(p, len, enc);
4450         if (ret == NEIGHBOR_FOUND) {
4451             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4452             if (rb_enc_isctype(c, ctype, enc))
4453                 return NEIGHBOR_FOUND;
4454         }
4455     }
4456     MEMCPY(p, save, char, len);
4457     range = 1;
4458     while (1) {
4459         MEMCPY(save, p, char, len);
4460         ret = enc_pred_char(p, len, enc);
4461         if (ret == NEIGHBOR_FOUND) {
4462             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4463             if (!rb_enc_isctype(c, ctype, enc)) {
4464                 MEMCPY(p, save, char, len);
4465                 break;
4466             }
4467         }
4468         else {
4469             MEMCPY(p, save, char, len);
4470             break;
4471         }
4472         range++;
4473     }
4474     if (range == 1) {
4475         return NEIGHBOR_NOT_CHAR;
4476     }
4477
4478     if (ctype != ONIGENC_CTYPE_DIGIT) {
4479         MEMCPY(carry, p, char, len);
4480         return NEIGHBOR_WRAPPED;
4481     }
4482
4483     MEMCPY(carry, p, char, len);
4484     enc_succ_char(carry, len, enc);
4485     return NEIGHBOR_WRAPPED;
4486 }
4487
4488
4489 static VALUE str_succ(VALUE str);
4490
4491 /*
4492  *  call-seq:
4493  *    succ -> new_str
4494  *
4495  *  Returns the successor to +self+. The successor is calculated by
4496  *  incrementing characters.
4497  *
 *  The first character to be incremented is the rightmost alphanumeric,
 *  or, if no alphanumerics, the rightmost character:
4500  *
4501  *    'THX1138'.succ # => "THX1139"
4502  *    '<<koala>>'.succ # => "<<koalb>>"
4503  *    '***'.succ # => '**+'
4504  *
4505  *  The successor to a digit is another digit, "carrying" to the next-left
4506  *  character for a "rollover" from 9 to 0, and prepending another digit
4507  *  if necessary:
4508  *
4509  *    '00'.succ # => "01"
4510  *    '09'.succ # => "10"
4511  *    '99'.succ # => "100"
4512  *
4513  *  The successor to a letter is another letter of the same case,
4514  *  carrying to the next-left character for a rollover,
4515  *  and prepending another same-case letter if necessary:
4516  *
4517  *    'aa'.succ # => "ab"
4518  *    'az'.succ # => "ba"
4519  *    'zz'.succ # => "aaa"
4520  *    'AA'.succ # => "AB"
4521  *    'AZ'.succ # => "BA"
4522  *    'ZZ'.succ # => "AAA"
4523  *
4524  *  The successor to a non-alphanumeric character is the next character
4525  *  in the underlying character set's collating sequence,
4526  *  carrying to the next-left character for a rollover,
4527  *  and prepending another character if necessary:
4528  *
4529  *    s = 0.chr * 3
4530  *    s # => "\x00\x00\x00"
4531  *    s.succ # => "\x00\x00\x01"
4532  *    s = 255.chr * 3
4533  *    s # => "\xFF\xFF\xFF"
4534  *    s.succ # => "\x01\x00\x00\x00"
4535  *
4536  *  Carrying can occur between and among mixtures of alphanumeric characters:
4537  *
4538  *    s = 'zz99zz99'
4539  *    s.succ # => "aaa00aa00"
4540  *    s = '99zz99zz'
4541  *    s.succ # => "100aa00aa"
4542  *
4543  *  The successor to an empty \String is a new empty \String:
4544  *
4545  *    ''.succ # => ""
4546  *
4547  *  String#next is an alias for String#succ.
4548  */
4549
4550 VALUE
4551 rb_str_succ(VALUE orig)
4552 {
4553     VALUE str;
4554     str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4555     rb_enc_cr_str_copy_for_substr(str, orig);
4556     return str_succ(str);
4557 }
4558
4559 static VALUE
4560 str_succ(VALUE str)
4561 {
4562     rb_encoding *enc;
4563     char *sbeg, *s, *e, *last_alnum = 0;
4564     int found_alnum = 0;
4565     long l, slen;
4566     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4567     long carry_pos = 0, carry_len = 1;
4568     enum neighbor_char neighbor = NEIGHBOR_FOUND;
4569
4570     slen = RSTRING_LEN(str);
4571     if (slen == 0) return str;
4572
4573     enc = STR_ENC_GET(str);
4574     sbeg = RSTRING_PTR(str);
4575     s = e = sbeg + slen;
4576
4577     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4578         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4579             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4580                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4581                 break;
4582             }
4583         }
4584         l = rb_enc_precise_mbclen(s, e, enc);
4585         if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4586         l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4587         neighbor = enc_succ_alnum_char(s, l, enc, carry);
4588         switch (neighbor) {
4589           case NEIGHBOR_NOT_CHAR:
4590             continue;
4591           case NEIGHBOR_FOUND:
4592             return str;
4593           case NEIGHBOR_WRAPPED:
4594             last_alnum = s;
4595             break;
4596         }
4597         found_alnum = 1;
4598         carry_pos = s - sbeg;
4599         carry_len = l;
4600     }
4601     if (!found_alnum) {         /* str contains no alnum */
4602         s = e;
4603         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4604             enum neighbor_char neighbor;
4605             char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4606             l = rb_enc_precise_mbclen(s, e, enc);
4607             if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4608             l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4609             MEMCPY(tmp, s, char, l);
4610             neighbor = enc_succ_char(tmp, l, enc);
4611             switch (neighbor) {
4612               case NEIGHBOR_FOUND:
4613                 MEMCPY(s, tmp, char, l);
4614                 return str;
4615                 break;
4616               case NEIGHBOR_WRAPPED:
4617                 MEMCPY(s, tmp, char, l);
4618                 break;
4619               case NEIGHBOR_NOT_CHAR:
4620                 break;
4621             }
4622             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4623                 /* wrapped to \0...\0.  search next valid char. */
4624                 enc_succ_char(s, l, enc);
4625             }
4626             if (!rb_enc_asciicompat(enc)) {
4627                 MEMCPY(carry, s, char, l);
4628                 carry_len = l;
4629             }
4630             carry_pos = s - sbeg;
4631         }
4632         ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
4633     }
4634     RESIZE_CAPA(str, slen + carry_len);
4635     sbeg = RSTRING_PTR(str);
4636     s = sbeg + carry_pos;
4637     memmove(s + carry_len, s, slen - carry_pos);
4638     memmove(s, carry, carry_len);
4639     slen += carry_len;
4640     STR_SET_LEN(str, slen);
4641     TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4642     rb_enc_str_coderange(str);
4643     return str;
4644 }
4645
4646
4647 /*
4648  *  call-seq:
4649  *    succ! -> self
4650  *
4651  *  Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4652  *
4653  *  String#next! is an alias for String#succ!.
4654  */
4655
4656 static VALUE
4657 rb_str_succ_bang(VALUE str)
4658 {
4659     rb_str_modify(str);
4660     str_succ(str);
4661     return str;
4662 }
4663
/*
 * Returns 1 if each of the first +len+ bytes at +s+ satisfies ISDIGIT,
 * 0 otherwise.  A +len+ of zero or less yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4673
4674 static int
4675 str_upto_i(VALUE str, VALUE arg)
4676 {
4677     rb_yield(str);
4678     return 0;
4679 }
4680
4681 /*
4682  *  call-seq:
4683  *    upto(other_string, exclusive = false) {|string| ... } -> self
4684  *    upto(other_string, exclusive = false) -> new_enumerator
4685  *
4686  *  With a block given, calls the block with each \String value
4687  *  returned by successive calls to String#succ;
4688  *  the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4689  *  the sequence terminates when value +other_string+ is reached;
4690  *  returns +self+:
4691  *
4692  *    'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4693  *  Output:
4694  *
4695  *    a8 a9 b0 b1 b2 b3 b4 b5 b6
4696  *
4697  *  If argument +exclusive+ is given as a truthy object, the last value is omitted:
4698  *
4699  *    'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4700  *
4701  *  Output:
4702  *
4703  *    a8 a9 b0 b1 b2 b3 b4 b5
4704  *
4705  *  If +other_string+ would not be reached, does not call the block:
4706  *
4707  *    '25'.upto('5') {|s| fail s }
4708  *    'aa'.upto('a') {|s| fail s }
4709  *
4710  *  With no block given, returns a new \Enumerator:
4711  *
4712  *    'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4713  *
4714  */
4715
4716 static VALUE
4717 rb_str_upto(int argc, VALUE *argv, VALUE beg)
4718 {
4719     VALUE end, exclusive;
4720
4721     rb_scan_args(argc, argv, "11", &end, &exclusive);
4722     RETURN_ENUMERATOR(beg, argc, argv);
4723     return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4724 }
4725
4726 VALUE
4727 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4728 {
4729     VALUE current, after_end;
4730     ID succ;
4731     int n, ascii;
4732     rb_encoding *enc;
4733
4734     CONST_ID(succ, "succ");
4735     StringValue(end);
4736     enc = rb_enc_check(beg, end);
4737     ascii = (is_ascii_string(beg) && is_ascii_string(end));
4738     /* single character */
4739     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4740         char c = RSTRING_PTR(beg)[0];
4741         char e = RSTRING_PTR(end)[0];
4742
4743         if (c > e || (excl && c == e)) return beg;
4744         for (;;) {
4745             if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4746             if (!excl && c == e) break;
4747             c++;
4748             if (excl && c == e) break;
4749         }
4750         return beg;
4751     }
4752     /* both edges are all digits */
4753     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4754         all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4755         all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4756         VALUE b, e;
4757         int width;
4758
4759         width = RSTRING_LENINT(beg);
4760         b = rb_str_to_inum(beg, 10, FALSE);
4761         e = rb_str_to_inum(end, 10, FALSE);
4762         if (FIXNUM_P(b) && FIXNUM_P(e)) {
4763             long bi = FIX2LONG(b);
4764             long ei = FIX2LONG(e);
4765             rb_encoding *usascii = rb_usascii_encoding();
4766
4767             while (bi <= ei) {
4768                 if (excl && bi == ei) break;
4769                 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4770                 bi++;
4771             }
4772         }
4773         else {
4774             ID op = excl ? '<' : idLE;
4775             VALUE args[2], fmt = rb_fstring_lit("%.*d");
4776
4777             args[0] = INT2FIX(width);
4778             while (rb_funcall(b, op, 1, e)) {
4779                 args[1] = b;
4780                 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4781                 b = rb_funcallv(b, succ, 0, 0);
4782             }
4783         }
4784         return beg;
4785     }
4786     /* normal case */
4787     n = rb_str_cmp(beg, end);
4788     if (n > 0 || (excl && n == 0)) return beg;
4789
4790     after_end = rb_funcallv(end, succ, 0, 0);
4791     current = str_duplicate(rb_cString, beg);
4792     while (!rb_str_equal(current, after_end)) {
4793         VALUE next = Qnil;
4794         if (excl || !rb_str_equal(current, end))
4795             next = rb_funcallv(current, succ, 0, 0);
4796         if ((*each)(current, arg)) break;
4797         if (NIL_P(next)) break;
4798         current = next;
4799         StringValue(current);
4800         if (excl && rb_str_equal(current, end)) break;
4801         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4802             break;
4803     }
4804
4805     return beg;
4806 }
4807
4808 VALUE
4809 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
4810 {
4811     VALUE current;
4812     ID succ;
4813
4814     CONST_ID(succ, "succ");
4815     /* both edges are all digits */
4816     if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4817         all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4818         VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4819         int width = RSTRING_LENINT(beg);
4820         b = rb_str_to_inum(beg, 10, FALSE);
4821         if (FIXNUM_P(b)) {
4822             long bi = FIX2LONG(b);
4823             rb_encoding *usascii = rb_usascii_encoding();
4824
4825             while (FIXABLE(bi)) {
4826                 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4827                 bi++;
4828             }
4829             b = LONG2NUM(bi);
4830         }
4831         args[0] = INT2FIX(width);
4832         while (1) {
4833             args[1] = b;
4834             if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4835             b = rb_funcallv(b, succ, 0, 0);
4836         }
4837     }
4838     /* normal case */
4839     current = str_duplicate(rb_cString, beg);
4840     while (1) {
4841         VALUE next = rb_funcallv(current, succ, 0, 0);
4842         if ((*each)(current, arg)) break;
4843         current = next;
4844         StringValue(current);
4845         if (RSTRING_LEN(current) == 0)
4846             break;
4847     }
4848
4849     return beg;
4850 }
4851
4852 static int
4853 include_range_i(VALUE str, VALUE arg)
4854 {
4855     VALUE *argp = (VALUE *)arg;
4856     if (!rb_equal(str, *argp)) return 0;
4857     *argp = Qnil;
4858     return 1;
4859 }
4860
4861 VALUE
4862 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4863 {
4864     beg = rb_str_new_frozen(beg);
4865     StringValue(end);
4866     end = rb_str_new_frozen(end);
4867     if (NIL_P(val)) return Qfalse;
4868     val = rb_check_string_type(val);
4869     if (NIL_P(val)) return Qfalse;
4870     if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4871         rb_enc_asciicompat(STR_ENC_GET(end)) &&
4872         rb_enc_asciicompat(STR_ENC_GET(val))) {
4873         const char *bp = RSTRING_PTR(beg);
4874         const char *ep = RSTRING_PTR(end);
4875         const char *vp = RSTRING_PTR(val);
4876         if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4877             if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4878                 return Qfalse;
4879             else {
4880                 char b = *bp;
4881                 char e = *ep;
4882                 char v = *vp;
4883
4884                 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4885                     if (b <= v && v < e) return Qtrue;
4886                     return RBOOL(!RTEST(exclusive) && v == e);
4887                 }
4888             }
4889         }
4890 #if 0
4891         /* both edges are all digits */
4892         if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4893             all_digits_p(bp, RSTRING_LEN(beg)) &&
4894             all_digits_p(ep, RSTRING_LEN(end))) {
4895             /* TODO */
4896         }
4897 #endif
4898     }
4899     rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4900
4901     return RBOOL(NIL_P(val));
4902 }
4903
4904 static VALUE
4905 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4906 {
4907     if (rb_reg_search(re, str, 0, 0) >= 0) {
4908         VALUE match = rb_backref_get();
4909         int nth = rb_reg_backref_number(match, backref);
4910         return rb_reg_nth_match(nth, match);
4911     }
4912     return Qnil;
4913 }
4914
4915 static VALUE
4916 rb_str_aref(VALUE str, VALUE indx)
4917 {
4918     long idx;
4919
4920     if (FIXNUM_P(indx)) {
4921         idx = FIX2LONG(indx);
4922     }
4923     else if (RB_TYPE_P(indx, T_REGEXP)) {
4924         return rb_str_subpat(str, indx, INT2FIX(0));
4925     }
4926     else if (RB_TYPE_P(indx, T_STRING)) {
4927         if (rb_str_index(str, indx, 0) != -1)
4928             return str_duplicate(rb_cString, indx);
4929         return Qnil;
4930     }
4931     else {
4932         /* check if indx is Range */
4933         long beg, len = str_strlen(str, NULL);
4934         switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4935           case Qfalse:
4936             break;
4937           case Qnil:
4938             return Qnil;
4939           default:
4940             return rb_str_substr(str, beg, len);
4941         }
4942         idx = NUM2LONG(indx);
4943     }
4944
4945     return str_substr(str, idx, 1, FALSE);
4946 }
4947
4948
4949 /*
4950  *  call-seq:
4951  *    string[index] -> new_string or nil
4952  *    string[start, length] -> new_string or nil
4953  *    string[range] -> new_string or nil
4954  *    string[regexp, capture = 0] -> new_string or nil
4955  *    string[substring] -> new_string or nil
4956  *
4957  *  Returns the substring of +self+ specified by the arguments.
4958  *
4959  *  When the single \Integer argument +index+ is given,
4960  *  returns the 1-character substring found in +self+ at offset +index+:
4961  *
4962  *    'bar'[2] # => "r"
4963  *
4964  *  Counts backward from the end of +self+ if +index+ is negative:
4965  *
4966  *    'foo'[-3] # => "f"
4967  *
4968  *  Returns +nil+ if +index+ is out of range:
4969  *
4970  *    'foo'[3] # => nil
4971  *    'foo'[-4] # => nil
4972  *
4973  *  When the two \Integer arguments  +start+ and +length+ are given,
4974  *  returns the substring of the given +length+ found in +self+ at offset +start+:
4975  *
4976  *    'foo'[0, 2] # => "fo"
4977  *    'foo'[0, 0] # => ""
4978  *
4979  *  Counts backward from the end of +self+ if +start+ is negative:
4980  *
4981  *    'foo'[-2, 2] # => "oo"
4982  *
4983  *  Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4984  *
4985  *    'foo'[3, 2] # => ""
4986  *
4987  *  Returns +nil+ if +start+ is out of range:
4988  *
4989  *    'foo'[4, 2] # => nil
4990  *    'foo'[-4, 2] # => nil
4991  *
4992  *  Returns the trailing substring of +self+ if +length+ is large:
4993  *
4994  *    'foo'[1, 50] # => "oo"
4995  *
4996  *  Returns +nil+ if +length+ is negative:
4997  *
4998  *    'foo'[0, -1] # => nil
4999  *
5000  *  When the single \Range argument +range+ is given,
5001  *  derives +start+ and +length+ values from the given +range+,
5002  *  and returns values as above:
5003  *
5004  *  - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
5005  *  - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
5006  *
5007  *  When the \Regexp argument +regexp+ is given,
5008  *  and the +capture+ argument is <tt>0</tt>,
5009  *  returns the first matching substring found in +self+,
5010  *  or +nil+ if none found:
5011  *
5012  *    'foo'[/o/] # => "o"
5013  *    'foo'[/x/] # => nil
5014  *    s = 'hello there'
5015  *    s[/[aeiou](.)\1/] # => "ell"
5016  *    s[/[aeiou](.)\1/, 0] # => "ell"
5017  *
5018  *  If argument +capture+ is given and not <tt>0</tt>,
5019  *  it should be either an \Integer capture group index or a \String or \Symbol capture group name;
5020  *  the method call returns only the specified capture
5021  *  (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
5022  *
5023  *    s = 'hello there'
5024  *    s[/[aeiou](.)\1/, 1] # => "l"
5025  *    s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
5026  *    s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
5027  *
5028  *  If an invalid capture group index is given, +nil+ is returned.  If an invalid
5029  *  capture group name is given, +IndexError+ is raised.
5030  *
5031  *  When the single \String argument +substring+ is given,
5032  *  returns the substring from +self+ if found, otherwise +nil+:
5033  *
5034  *    'foo'['oo'] # => "oo"
5035  *    'foo'['xx'] # => nil
5036  *
5037  *  String#slice is an alias for String#[].
5038  */
5039
5040 static VALUE
5041 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5042 {
5043     if (argc == 2) {
5044         if (RB_TYPE_P(argv[0], T_REGEXP)) {
5045             return rb_str_subpat(str, argv[0], argv[1]);
5046         }
5047         else {
5048             long beg = NUM2LONG(argv[0]);
5049             long len = NUM2LONG(argv[1]);
5050             return rb_str_substr(str, beg, len);
5051         }
5052     }
5053     rb_check_arity(argc, 1, 2);
5054     return rb_str_aref(str, argv[0]);
5055 }
5056
5057 VALUE
5058 rb_str_drop_bytes(VALUE str, long len)
5059 {
5060     char *ptr = RSTRING_PTR(str);
5061     long olen = RSTRING_LEN(str), nlen;
5062
5063     str_modifiable(str);
5064     if (len > olen) len = olen;
5065     nlen = olen - len;
5066     if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5067         char *oldptr = ptr;
5068         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5069         STR_SET_EMBED(str);
5070         STR_SET_EMBED_LEN(str, nlen);
5071         ptr = RSTRING(str)->as.embed.ary;
5072         memmove(ptr, oldptr + len, nlen);
5073         if (fl == STR_NOEMBED) xfree(oldptr);
5074     }
5075     else {
5076         if (!STR_SHARED_P(str)) {
5077             VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5078             rb_enc_cr_str_exact_copy(shared, str);
5079             OBJ_FREEZE(shared);
5080         }
5081         ptr = RSTRING(str)->as.heap.ptr += len;
5082         RSTRING(str)->as.heap.len = nlen;
5083     }
5084     ptr[nlen] = 0;
5085     ENC_CODERANGE_CLEAR(str);
5086     return str;
5087 }
5088
5089 static void
5090 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5091 {
5092     char *sptr;
5093     long slen, vlen = RSTRING_LEN(val);
5094     int cr;
5095
5096     if (beg == 0 && vlen == 0) {
5097         rb_str_drop_bytes(str, len);
5098         return;
5099     }
5100
5101     str_modify_keep_cr(str);
5102     RSTRING_GETMEM(str, sptr, slen);
5103     if (len < vlen) {
5104         /* expand string */
5105         RESIZE_CAPA(str, slen + vlen - len);
5106         sptr = RSTRING_PTR(str);
5107     }
5108
5109     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
5110         cr = rb_enc_str_coderange(val);
5111     else
5112         cr = ENC_CODERANGE_UNKNOWN;
5113
5114     if (vlen != len) {
5115         memmove(sptr + beg + vlen,
5116                 sptr + beg + len,
5117                 slen - (beg + len));
5118     }
5119     if (vlen < beg && len < 0) {
5120         MEMZERO(sptr + slen, char, -len);
5121     }
5122     if (vlen > 0) {
5123         memmove(sptr + beg, RSTRING_PTR(val), vlen);
5124     }
5125     slen += vlen - len;
5126     STR_SET_LEN(str, slen);
5127     TERM_FILL(&sptr[slen], TERM_LEN(str));
5128     ENC_CODERANGE_SET(str, cr);
5129 }
5130
5131 void
5132 rb_str_update(VALUE str, long beg, long len, VALUE val)
5133 {
5134     long slen;
5135     char *p, *e;
5136     rb_encoding *enc;
5137     int singlebyte = single_byte_optimizable(str);
5138     int cr;
5139
5140     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5141
5142     StringValue(val);
5143     enc = rb_enc_check(str, val);
5144     slen = str_strlen(str, enc); /* rb_enc_check */
5145
5146     if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5147         rb_raise(rb_eIndexError, "index %ld out of string", beg);
5148     }
5149     if (beg < 0) {
5150         beg += slen;
5151     }
5152     assert(beg >= 0);
5153     assert(beg <= slen);
5154     if (len > slen - beg) {
5155         len = slen - beg;
5156     }
5157     str_modify_keep_cr(str);
5158     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5159     if (!p) p = RSTRING_END(str);
5160     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5161     if (!e) e = RSTRING_END(str);
5162     /* error check */
5163     beg = p - RSTRING_PTR(str); /* physical position */
5164     len = e - p;                /* physical length */
5165     rb_str_splice_0(str, beg, len, val);
5166     rb_enc_associate(str, enc);
5167     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
5168     if (cr != ENC_CODERANGE_BROKEN)
5169         ENC_CODERANGE_SET(str, cr);
5170 }
5171
5172 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5173
5174 static void
5175 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5176 {
5177     int nth;
5178     VALUE match;
5179     long start, end, len;
5180     rb_encoding *enc;
5181     struct re_registers *regs;
5182
5183     if (rb_reg_search(re, str, 0, 0) < 0) {
5184         rb_raise(rb_eIndexError, "regexp not matched");
5185     }
5186     match = rb_backref_get();
5187     nth = rb_reg_backref_number(match, backref);
5188     regs = RMATCH_REGS(match);
5189     if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5190         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5191     }
5192     if (nth < 0) {
5193         nth += regs->num_regs;
5194     }
5195
5196     start = BEG(nth);
5197     if (start == -1) {
5198         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5199     }
5200     end = END(nth);
5201     len = end - start;
5202     StringValue(val);
5203     enc = rb_enc_check_str(str, val);
5204     rb_str_splice_0(str, start, len, val);
5205     rb_enc_associate(str, enc);
5206 }
5207
5208 static VALUE
5209 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5210 {
5211     long idx, beg;
5212
5213     switch (TYPE(indx)) {
5214       case T_REGEXP:
5215         rb_str_subpat_set(str, indx, INT2FIX(0), val);
5216         return val;
5217
5218       case T_STRING:
5219         beg = rb_str_index(str, indx, 0);
5220         if (beg < 0) {
5221             rb_raise(rb_eIndexError, "string not matched");
5222         }
5223         beg = rb_str_sublen(str, beg);
5224         rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5225         return val;
5226
5227       default:
5228         /* check if indx is Range */
5229         {
5230             long beg, len;
5231             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5232                 rb_str_splice(str, beg, len, val);
5233                 return val;
5234             }
5235         }
5236         /* FALLTHROUGH */
5237
5238       case T_FIXNUM:
5239         idx = NUM2LONG(indx);
5240         rb_str_splice(str, idx, 1, val);
5241         return val;
5242     }
5243 }
5244
5245 /*
5246  *  call-seq:
5247  *     str[integer] = new_str
5248  *     str[integer, integer] = new_str
5249  *     str[range] = new_str
5250  *     str[regexp] = new_str
5251  *     str[regexp, integer] = new_str
5252  *     str[regexp, name] = new_str
5253  *     str[other_str] = new_str
5254  *
5255  *  Element Assignment---Replaces some or all of the content of
5256  *  <i>str</i>. The portion of the string affected is determined using
5257  *  the same criteria as String#[]. If the replacement string is not
5258  *  the same length as the text it is replacing, the string will be
5259  *  adjusted accordingly. If the regular expression or string used
5260  *  as the index doesn't match a position in the string, IndexError is
5261  *  raised. If the regular expression form is used, the optional
5262  *  second Integer allows you to specify which portion of the match to
5263  *  replace (effectively using the MatchData indexing rules). The forms
5264  *  that take an Integer will raise an IndexError if the value is out
5265  *  of range; the Range form will raise a RangeError, and the Regexp
5266  *  and String forms will raise an IndexError when there is no match.
5267  */
5268
5269 static VALUE
5270 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5271 {
5272     if (argc == 3) {
5273         if (RB_TYPE_P(argv[0], T_REGEXP)) {
5274             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5275         }
5276         else {
5277             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5278         }
5279         return argv[2];
5280     }
5281     rb_check_arity(argc, 2, 3);
5282     return rb_str_aset(str, argv[0], argv[1]);
5283 }
5284
5285 /*
5286  *  call-seq:
5287  *    insert(index, other_string) -> self
5288  *
5289  *  Inserts the given +other_string+ into +self+; returns +self+.
5290  *
5291  *  If the \Integer +index+ is nonnegative, inserts +other_string+ at offset +index+:
5292  *
5293  *    'foo'.insert(1, 'bar') # => "fbaroo"
5294  *
5295  *  If the \Integer +index+ is negative, counts backward from the end of +self+
5296  *  and inserts +other_string+ at offset <tt>index+1</tt>
5297  *  (that is, _after_ <tt>self[index]</tt>):
5298  *
5299  *    'foo'.insert(-2, 'bar') # => "fobaro"
5300  *
5301  */
5302
5303 static VALUE
5304 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5305 {
5306     long pos = NUM2LONG(idx);
5307
5308     if (pos == -1) {
5309         return rb_str_append(str, str2);
5310     }
5311     else if (pos < 0) {
5312         pos++;
5313     }
5314     rb_str_splice(str, pos, 0, str2);
5315     return str;
5316 }
5317
5318
5319 /*
5320  *  call-seq:
5321  *     slice!(index)               -> new_string or nil
5322  *     slice!(start, length)       -> new_string or nil
5323  *     slice!(range)               -> new_string or nil
5324  *     slice!(regexp, capture = 0) -> new_string or nil
5325  *     slice!(substring)           -> new_string or nil
5326  *
5327  *  Removes the substring of +self+ specified by the arguments;
5328  *  returns the removed substring.
5329  *
5330  *  See String#[] for details about the arguments that specify the substring.
5331  *
5332  *  A few examples:
5333  *
5334  *     string = "This is a string"
5335  *     string.slice!(2)        #=> "i"
5336  *     string.slice!(3..6)     #=> " is "
5337  *     string.slice!(/s.*t/)   #=> "sa st"
5338  *     string.slice!("r")      #=> "r"
5339  *     string                  #=> "Thing"
5340  *
5341  */
5342
5343 static VALUE
5344 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5345 {
5346     VALUE result = Qnil;
5347     VALUE indx;
5348     long beg, len = 1;
5349     char *p;
5350
5351     rb_check_arity(argc, 1, 2);
5352     str_modify_keep_cr(str);
5353     indx = argv[0];
5354     if (RB_TYPE_P(indx, T_REGEXP)) {
5355         if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5356         VALUE match = rb_backref_get();
5357         struct re_registers *regs = RMATCH_REGS(match);
5358         int nth = 0;
5359         if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5360             if ((nth += regs->num_regs) <= 0) return Qnil;
5361         }
5362         else if (nth >= regs->num_regs) return Qnil;
5363         beg = BEG(nth);
5364         len = END(nth) - beg;
5365         goto subseq;
5366     }
5367     else if (argc == 2) {
5368         beg = NUM2LONG(indx);
5369         len = NUM2LONG(argv[1]);
5370         goto num_index;
5371     }
5372     else if (FIXNUM_P(indx)) {
5373         beg = FIX2LONG(indx);
5374         if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5375         if (!len) return Qnil;
5376         beg = p - RSTRING_PTR(str);
5377         goto subseq;
5378     }
5379     else if (RB_TYPE_P(indx, T_STRING)) {
5380         beg = rb_str_index(str, indx, 0);
5381         if (beg == -1) return Qnil;
5382         len = RSTRING_LEN(indx);
5383         result = str_duplicate(rb_cString, indx);
5384         goto squash;
5385     }
5386     else {
5387         switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5388           case Qnil:
5389             return Qnil;
5390           case Qfalse:
5391             beg = NUM2LONG(indx);
5392             if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5393             if (!len) return Qnil;
5394             beg = p - RSTRING_PTR(str);
5395             goto subseq;
5396           default:
5397             goto num_index;
5398         }
5399     }
5400
5401   num_index:
5402     if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5403     beg = p - RSTRING_PTR(str);
5404
5405   subseq:
5406     result = rb_str_new(RSTRING_PTR(str)+beg, len);
5407     rb_enc_cr_str_copy_for_substr(result, str);
5408
5409   squash:
5410     if (len > 0) {
5411         if (beg == 0) {
5412             rb_str_drop_bytes(str, len);
5413         }
5414         else {
5415             char *sptr = RSTRING_PTR(str);
5416             long slen = RSTRING_LEN(str);
5417             if (beg + len > slen) /* pathological check */
5418                 len = slen - beg;
5419             memmove(sptr + beg,
5420                     sptr + beg + len,
5421                     slen - (beg + len));
5422             slen -= len;
5423             STR_SET_LEN(str, slen);
5424             TERM_FILL(&sptr[slen], TERM_LEN(str));
5425         }
5426     }
5427     return result;
5428 }
5429
5430 static VALUE
5431 get_pat(VALUE pat)
5432 {
5433     VALUE val;
5434
5435     switch (OBJ_BUILTIN_TYPE(pat)) {
5436       case T_REGEXP:
5437         return pat;
5438
5439       case T_STRING:
5440         break;
5441
5442       default:
5443         val = rb_check_string_type(pat);
5444         if (NIL_P(val)) {
5445             Check_Type(pat, T_REGEXP);
5446         }
5447         pat = val;
5448     }
5449
5450     return rb_reg_regcomp(pat);
5451 }
5452
5453 static VALUE
5454 get_pat_quoted(VALUE pat, int check)
5455 {
5456     VALUE val;
5457
5458     switch (OBJ_BUILTIN_TYPE(pat)) {
5459       case T_REGEXP:
5460         return pat;
5461
5462       case T_STRING:
5463         break;
5464
5465       default:
5466         val = rb_check_string_type(pat);
5467         if (NIL_P(val)) {
5468             Check_Type(pat, T_REGEXP);
5469         }
5470         pat = val;
5471     }
5472     if (check && is_broken_string(pat)) {
5473         rb_exc_raise(rb_reg_check_preprocess(pat));
5474     }
5475     return pat;
5476 }
5477
5478 static long
5479 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5480 {
5481     if (BUILTIN_TYPE(pat) == T_STRING) {
5482         pos = rb_strseq_index(str, pat, pos, 1);
5483         if (set_backref_str) {
5484             if (pos >= 0) {
5485                 str = rb_str_new_frozen_String(str);
5486                 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5487             }
5488             else {
5489                 rb_backref_set(Qnil);
5490             }
5491         }
5492         return pos;
5493     }
5494     else {
5495         return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5496     }
5497 }
5498
5499
5500 /*
5501  *  call-seq:
5502  *    sub!(pattern, replacement)   -> self or nil
5503  *    sub!(pattern) {|match| ... } -> self or nil
5504  *
5505  *  Replaces only the first occurrence (not all occurrences) of the given
5506  *  +pattern+ in +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
5507  *
5508  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5509  *
5510  *  Related: String#sub, String#gsub, String#gsub!.
5511  *
5512  */
5513
5514 static VALUE
5515 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5516 {
5517     VALUE pat, repl, hash = Qnil;
5518     int iter = 0;
5519     long plen;
5520     int min_arity = rb_block_given_p() ? 1 : 2;
5521     long beg;
5522
5523     rb_check_arity(argc, min_arity, 2);
5524     if (argc == 1) {
5525         iter = 1;
5526     }
5527     else {
5528         repl = argv[1];
5529         hash = rb_check_hash_type(argv[1]);
5530         if (NIL_P(hash)) {
5531             StringValue(repl);
5532         }
5533     }
5534
5535     pat = get_pat_quoted(argv[0], 1);
5536
5537     str_modifiable(str);
5538     beg = rb_pat_search(pat, str, 0, 1);
5539     if (beg >= 0) {
5540         rb_encoding *enc;
5541         int cr = ENC_CODERANGE(str);
5542         long beg0, end0;
5543         VALUE match, match0 = Qnil;
5544         struct re_registers *regs;
5545         char *p, *rp;
5546         long len, rlen;
5547
5548         match = rb_backref_get();
5549         regs = RMATCH_REGS(match);
5550         if (RB_TYPE_P(pat, T_STRING)) {
5551             beg0 = beg;
5552             end0 = beg0 + RSTRING_LEN(pat);
5553             match0 = pat;
5554         }
5555         else {
5556             beg0 = BEG(0);
5557             end0 = END(0);
5558             if (iter) match0 = rb_reg_nth_match(0, match);
5559         }
5560
5561         if (iter || !NIL_P(hash)) {
5562             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5563
5564             if (iter) {
5565                 repl = rb_obj_as_string(rb_yield(match0));
5566             }
5567             else {
5568                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5569                 repl = rb_obj_as_string(repl);
5570             }
5571             str_mod_check(str, p, len);
5572             rb_check_frozen(str);
5573         }
5574         else {
5575             repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5576         }
5577
5578         enc = rb_enc_compatible(str, repl);
5579         if (!enc) {
5580             rb_encoding *str_enc = STR_ENC_GET(str);
5581             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5582             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5583                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5584                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5585                          rb_enc_name(str_enc),
5586                          rb_enc_name(STR_ENC_GET(repl)));
5587             }
5588             enc = STR_ENC_GET(repl);
5589         }
5590         rb_str_modify(str);
5591         rb_enc_associate(str, enc);
5592         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
5593             int cr2 = ENC_CODERANGE(repl);
5594             if (cr2 == ENC_CODERANGE_BROKEN ||
5595                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5596                 cr = ENC_CODERANGE_UNKNOWN;
5597             else
5598                 cr = cr2;
5599         }
5600         plen = end0 - beg0;
5601         rlen = RSTRING_LEN(repl);
5602         len = RSTRING_LEN(str);
5603         if (rlen > plen) {
5604             RESIZE_CAPA(str, len + rlen - plen);
5605         }
5606         p = RSTRING_PTR(str);
5607         if (rlen != plen) {
5608             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5609         }
5610         rp = RSTRING_PTR(repl);
5611         memmove(p + beg0, rp, rlen);
5612         len += rlen - plen;
5613         STR_SET_LEN(str, len);
5614         TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5615         ENC_CODERANGE_SET(str, cr);
5616
5617         return str;
5618     }
5619     return Qnil;
5620 }
5621
5622
5623 /*
5624  *  call-seq:
5625  *    sub(pattern, replacement)   -> new_string
5626  *    sub(pattern) {|match| ... } -> new_string
5627  *
5628  *  Returns a copy of +self+ with only the first occurrence
5629  *  (not all occurrences) of the given +pattern+ replaced.
5630  *
5631  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5632  *
5633  *  Related: String#sub!, String#gsub, String#gsub!.
5634  *
5635  */
5636
5637 static VALUE
5638 rb_str_sub(int argc, VALUE *argv, VALUE str)
5639 {
5640     str = str_duplicate(rb_cString, str);
5641     rb_str_sub_bang(argc, argv, str);
5642     return str;
5643 }
5644
5645 static VALUE
5646 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5647 {
5648     VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5649     struct re_registers *regs;
5650     long beg, beg0, end0;
5651     long offset, blen, slen, len, last;
5652     enum {STR, ITER, MAP} mode = STR;
5653     char *sp, *cp;
5654     int need_backref = -1;
5655     rb_encoding *str_enc;
5656
5657     switch (argc) {
5658       case 1:
5659         RETURN_ENUMERATOR(str, argc, argv);
5660         mode = ITER;
5661         break;
5662       case 2:
5663         repl = argv[1];
5664         hash = rb_check_hash_type(argv[1]);
5665         if (NIL_P(hash)) {
5666             StringValue(repl);
5667         }
5668         else {
5669             mode = MAP;
5670         }
5671         break;
5672       default:
5673         rb_error_arity(argc, 1, 2);
5674     }
5675
5676     pat = get_pat_quoted(argv[0], 1);
5677     beg = rb_pat_search(pat, str, 0, need_backref);
5678     if (beg < 0) {
5679         if (bang) return Qnil;  /* no match, no substitution */
5680         return str_duplicate(rb_cString, str);
5681     }
5682
5683     offset = 0;
5684     blen = RSTRING_LEN(str) + 30; /* len + margin */
5685     dest = rb_str_buf_new(blen);
5686     sp = RSTRING_PTR(str);
5687     slen = RSTRING_LEN(str);
5688     cp = sp;
5689     str_enc = STR_ENC_GET(str);
5690     rb_enc_associate(dest, str_enc);
5691     ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
5692
5693     do {
5694         match = rb_backref_get();
5695         regs = RMATCH_REGS(match);
5696         if (RB_TYPE_P(pat, T_STRING)) {
5697             beg0 = beg;
5698             end0 = beg0 + RSTRING_LEN(pat);
5699             match0 = pat;
5700         }
5701         else {
5702             beg0 = BEG(0);
5703             end0 = END(0);
5704             if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5705         }
5706
5707         if (mode) {
5708             if (mode == ITER) {
5709                 val = rb_obj_as_string(rb_yield(match0));
5710             }
5711             else {
5712                 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5713                 val = rb_obj_as_string(val);
5714             }
5715             str_mod_check(str, sp, slen);
5716             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
5717                 rb_raise(rb_eRuntimeError, "block should not cheat");
5718             }
5719         }
5720         else if (need_backref) {
5721             val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5722             if (need_backref < 0) {
5723                 need_backref = val != repl;
5724             }
5725         }
5726         else {
5727             val = repl;
5728         }
5729
5730         len = beg0 - offset;    /* copy pre-match substr */
5731         if (len) {
5732             rb_enc_str_buf_cat(dest, cp, len, str_enc);
5733         }
5734
5735         rb_str_buf_append(dest, val);
5736
5737         last = offset;
5738         offset = end0;
5739         if (beg0 == end0) {
5740             /*
5741              * Always consume at least one character of the input string
5742              * in order to prevent infinite loops.
5743              */
5744             if (RSTRING_LEN(str) <= end0) break;
5745             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5746             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5747             offset = end0 + len;
5748         }
5749         cp = RSTRING_PTR(str) + offset;
5750         if (offset > RSTRING_LEN(str)) break;
5751         beg = rb_pat_search(pat, str, offset, need_backref);
5752     } while (beg >= 0);
5753     if (RSTRING_LEN(str) > offset) {
5754         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5755     }
5756     rb_pat_search(pat, str, last, 1);
5757     if (bang) {
5758         str_shared_replace(str, dest);
5759     }
5760     else {
5761         str = dest;
5762     }
5763
5764     return str;
5765 }
5766
5767
5768 /*
5769  *  call-seq:
5770  *     gsub!(pattern, replacement)   -> self or nil
5771  *     gsub!(pattern) {|match| ... } -> self or nil
5772  *     gsub!(pattern)                -> an_enumerator
5773  *
5774  *  Performs the specified substring replacement(s) on +self+;
5775  *  returns +self+ if any replacement occurred, +nil+ otherwise.
5776  *
5777  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5778  *
5779  *  Returns an Enumerator if no +replacement+ and no block given.
5780  *
5781  *  Related: String#sub, String#gsub, String#sub!.
5782  *
5783  */
5784
5785 static VALUE
5786 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5787 {
5788     str_modify_keep_cr(str);
5789     return str_gsub(argc, argv, str, 1);
5790 }
5791
5792
5793 /*
5794  *  call-seq:
5795  *     gsub(pattern, replacement)   -> new_string
5796  *     gsub(pattern) {|match| ... } -> new_string
5797  *     gsub(pattern)                -> enumerator
5798  *
5799  *  Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5800  *
5801  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5802  *
5803  *  Returns an Enumerator if no +replacement+ and no block given.
5804  *
5805  *  Related: String#sub, String#sub!, String#gsub!.
5806  *
5807  */
5808
5809 static VALUE
5810 rb_str_gsub(int argc, VALUE *argv, VALUE str)
5811 {
5812     return str_gsub(argc, argv, str, 0);
5813 }
5814
5815
5816 /*
5817  *  call-seq:
5818  *    replace(other_string) -> self
5819  *
5820  *  Replaces the contents of +self+ with the contents of +other_string+:
5821  *
5822  *    s = 'foo'        # => "foo"
5823  *    s.replace('bar') # => "bar"
5824  *
5825  */
5826
5827 VALUE
5828 rb_str_replace(VALUE str, VALUE str2)
5829 {
5830     str_modifiable(str);
5831     if (str == str2) return str;
5832
5833     StringValue(str2);
5834     str_discard(str);
5835     return str_replace(str, str2);
5836 }
5837
5838 /*
5839  *  call-seq:
5840  *    clear -> self
5841  *
5842  *  Removes the contents of +self+:
5843  *
5844  *    s = 'foo' # => "foo"
5845  *    s.clear   # => ""
5846  *
5847  */
5848
5849 static VALUE
5850 rb_str_clear(VALUE str)
5851 {
5852     str_discard(str);
5853     STR_SET_EMBED(str);
5854     STR_SET_EMBED_LEN(str, 0);
5855     RSTRING_PTR(str)[0] = 0;
5856     if (rb_enc_asciicompat(STR_ENC_GET(str)))
5857         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
5858     else
5859         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5860     return str;
5861 }
5862
5863 /*
5864  *  call-seq:
5865  *    chr -> string
5866  *
5867  *  Returns a string containing the first character of +self+:
5868  *
5869  *    s = 'foo' # => "foo"
5870  *    s.chr     # => "f"
5871  *
5872  */
5873
5874 static VALUE
5875 rb_str_chr(VALUE str)
5876 {
5877     return rb_str_substr(str, 0, 1);
5878 }
5879
5880 /*
5881  *  call-seq:
5882  *    getbyte(index) -> integer
5883  *
5884  *  Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range; a negative +index+ counts backward from the end:
5885  *
5886  *    s = 'abcde'  # => "abcde"
5887  *    s.getbyte(0) # => 97
5888  *    s.getbyte(1) # => 98
5889  *
5890  *  Related: String#setbyte.
5891  */
5892 static VALUE
5893 rb_str_getbyte(VALUE str, VALUE index)
5894 {
5895     long pos = NUM2LONG(index);
5896
5897     if (pos < 0)
5898         pos += RSTRING_LEN(str);
5899     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
5900         return Qnil;
5901
5902     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5903 }
5904
5905 /*
5906  *  call-seq:
5907  *    setbyte(index, integer) -> integer
5908  *
5909  *  Sets the byte at zero-based +index+ to +integer+; returns +integer+:
5910  *
5911  *    s = 'abcde'      # => "abcde"
5912  *    s.setbyte(0, 98) # => 98
5913  *    s                # => "bbcde"
5914  *
5915  *  Related: String#getbyte.
5916  */
5917 static VALUE
5918 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5919 {
5920     long pos = NUM2LONG(index);
5921     long len = RSTRING_LEN(str);
5922     char *ptr, *head, *left = 0;
5923     rb_encoding *enc;
5924     int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5925
5926     if (pos < -len || len <= pos)
5927         rb_raise(rb_eIndexError, "index %ld out of string", pos);
5928     if (pos < 0)
5929         pos += len;
5930
5931     VALUE v = rb_to_int(value);
5932     VALUE w = rb_int_and(v, INT2FIX(0xff));
5933     char byte = (char)(NUM2INT(w) & 0xFF);
5934
5935     if (!str_independent(str))
5936         str_make_independent(str);
5937     enc = STR_ENC_GET(str);
5938     head = RSTRING_PTR(str);
5939     ptr = &head[pos];
5940     if (!STR_EMBED_P(str)) {
5941         cr = ENC_CODERANGE(str);
5942         switch (cr) {
5943           case ENC_CODERANGE_7BIT:
5944             left = ptr;
5945             *ptr = byte;
5946             if (ISASCII(byte)) goto end;
5947             nlen = rb_enc_precise_mbclen(left, head+len, enc);
5948             if (!MBCLEN_CHARFOUND_P(nlen))
5949                 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5950             else
5951                 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5952             goto end;
5953           case ENC_CODERANGE_VALID:
5954             left = rb_enc_left_char_head(head, ptr, head+len, enc);
5955             width = rb_enc_precise_mbclen(left, head+len, enc);
5956             *ptr = byte;
5957             nlen = rb_enc_precise_mbclen(left, head+len, enc);
5958             if (!MBCLEN_CHARFOUND_P(nlen))
5959                 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5960             else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5961                 ENC_CODERANGE_CLEAR(str);
5962             goto end;
5963         }
5964     }
5965     ENC_CODERANGE_CLEAR(str);
5966     *ptr = byte;
5967
5968   end:
5969     return value;
5970 }
5971
5972 static VALUE
5973 str_byte_substr(VALUE str, long beg, long len, int empty)
5974 {
5975     char *p, *s = RSTRING_PTR(str);
5976     long n = RSTRING_LEN(str);
5977     VALUE str2;
5978
5979     if (beg > n || len < 0) return Qnil;
5980     if (beg < 0) {
5981         beg += n;
5982         if (beg < 0) return Qnil;
5983     }
5984     if (len > n - beg)
5985         len = n - beg;
5986     if (len <= 0) {
5987         if (!empty) return Qnil;
5988         len = 0;
5989         p = 0;
5990     }
5991     else
5992         p = s + beg;
5993
5994     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
5995         str2 = rb_str_new_frozen(str);
5996         str2 = str_new_shared(rb_cString, str2);
5997         RSTRING(str2)->as.heap.ptr += beg;
5998         RSTRING(str2)->as.heap.len = len;
5999     }
6000     else {
6001         str2 = rb_str_new(p, len);
6002     }
6003
6004     str_enc_copy(str2, str);
6005
6006     if (RSTRING_LEN(str2) == 0) {
6007         if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6008             ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
6009         else
6010             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6011     }
6012     else {
6013         switch (ENC_CODERANGE(str)) {
6014           case ENC_CODERANGE_7BIT:
6015             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6016             break;
6017           default:
6018             ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
6019             break;
6020         }
6021     }
6022
6023     return str2;
6024 }
6025
6026 static VALUE
6027 str_byte_aref(VALUE str, VALUE indx)
6028 {
6029     long idx;
6030     if (FIXNUM_P(indx)) {
6031         idx = FIX2LONG(indx);
6032     }
6033     else {
6034         /* check if indx is Range */
6035         long beg, len = RSTRING_LEN(str);
6036
6037         switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6038           case Qfalse:
6039             break;
6040           case Qnil:
6041             return Qnil;
6042           default:
6043             return str_byte_substr(str, beg, len, TRUE);
6044         }
6045
6046         idx = NUM2LONG(indx);
6047     }
6048     return str_byte_substr(str, idx, 1, FALSE);
6049 }
6050
6051 /*
6052  *  call-seq:
6053  *    byteslice(index, length = 1) -> string or nil
6054  *    byteslice(range)             -> string or nil
6055  *
6056  *  Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6057  *
6058  *  With integer arguments +index+ and +length+ given,
6059  *  returns the substring beginning at the given +index+
6060  *  of the given +length+ (if possible),
6061  *  or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6062  *
6063  *    s = '0123456789' # => "0123456789"
6064  *    s.byteslice(2)   # => "2"
6065  *    s.byteslice(200) # => nil
6066  *    s.byteslice(4, 3)  # => "456"
6067  *    s.byteslice(4, 30) # => "456789"
6068  *    s.byteslice(4, -1) # => nil
6069  *    s.byteslice(40, 2) # => nil
6070  *
6071  *  In either case above, counts backwards from the end of +self+
6072  *  if +index+ is negative:
6073  *
6074  *    s = '0123456789'   # => "0123456789"
6075  *    s.byteslice(-4)    # => "6"
6076  *    s.byteslice(-4, 3) # => "678"
6077  *
6078  *  With Range argument +range+ given, returns
6079  *  <tt>byteslice(range.begin, range.size)</tt>:
6080  *
6081  *    s = '0123456789'    # => "0123456789"
6082  *    s.byteslice(4..6)   # => "456"
6083  *    s.byteslice(-6..-4) # => "456"
6084  *    s.byteslice(5..2)   # => "" # range.size is zero.
6085  *    s.byteslice(40..42) # => nil
6086  *
6087  *  In all cases, a returned string has the same encoding as +self+:
6088  *
6089  *    s.encoding              # => #<Encoding:UTF-8>
6090  *    s.byteslice(4).encoding # => #<Encoding:UTF-8>
6091  *
6092  */
6093
6094 static VALUE
6095 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6096 {
6097     if (argc == 2) {
6098         long beg = NUM2LONG(argv[0]);
6099         long end = NUM2LONG(argv[1]);
6100         return str_byte_substr(str, beg, end, TRUE);
6101     }
6102     rb_check_arity(argc, 1, 2);
6103     return str_byte_aref(str, argv[0]);
6104 }
6105
6106 /*
6107  *  call-seq:
6108  *    reverse -> string
6109  *
6110  *  Returns a new string with the characters from +self+ in reverse order.
6111  *
6112  *    'stressed'.reverse # => "desserts"
6113  *
6114  */
6115
6116 static VALUE
6117 rb_str_reverse(VALUE str)
6118 {
6119     rb_encoding *enc;
6120     VALUE rev;
6121     char *s, *e, *p;
6122     int cr;
6123
6124     if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6125     enc = STR_ENC_GET(str);
6126     rev = rb_str_new(0, RSTRING_LEN(str));
6127     s = RSTRING_PTR(str); e = RSTRING_END(str);
6128     p = RSTRING_END(rev);
6129     cr = ENC_CODERANGE(str);
6130
6131     if (RSTRING_LEN(str) > 1) {
6132         if (single_byte_optimizable(str)) {
6133             while (s < e) {
6134                 *--p = *s++;
6135             }
6136         }
6137         else if (cr == ENC_CODERANGE_VALID) {
6138             while (s < e) {
6139                 int clen = rb_enc_fast_mbclen(s, e, enc);
6140
6141                 p -= clen;
6142                 memcpy(p, s, clen);
6143                 s += clen;
6144             }
6145         }
6146         else {
6147             cr = rb_enc_asciicompat(enc) ?
6148                 ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
6149             while (s < e) {
6150                 int clen = rb_enc_mbclen(s, e, enc);
6151
6152                 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6153                 p -= clen;
6154                 memcpy(p, s, clen);
6155                 s += clen;
6156             }
6157         }
6158     }
6159     STR_SET_LEN(rev, RSTRING_LEN(str));
6160     str_enc_copy(rev, str);
6161     ENC_CODERANGE_SET(rev, cr);
6162
6163     return rev;
6164 }
6165
6166
6167 /*
6168  *  call-seq:
6169  *    reverse! -> self
6170  *
6171  *  Returns +self+ with its characters reversed:
6172  *
6173  *    s = 'stressed'
6174  *    s.reverse! # => "desserts"
6175  *    s          # => "desserts"
6176  *
6177  */
6178
6179 static VALUE
6180 rb_str_reverse_bang(VALUE str)
6181 {
6182     if (RSTRING_LEN(str) > 1) {
6183         if (single_byte_optimizable(str)) {
6184             char *s, *e, c;
6185
6186             str_modify_keep_cr(str);
6187             s = RSTRING_PTR(str);
6188             e = RSTRING_END(str) - 1;
6189             while (s < e) {
6190                 c = *s;
6191                 *s++ = *e;
6192                 *e-- = c;
6193             }
6194         }
6195         else {
6196             str_shared_replace(str, rb_str_reverse(str));
6197         }
6198     }
6199     else {
6200         str_modify_keep_cr(str);
6201     }
6202     return str;
6203 }
6204
6205
6206 /*
6207  *  call-seq:
6208  *    include? other_string -> true or false
6209  *
6210  *  Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6211  *
6212  *    s = 'foo'
6213  *    s.include?('f')    # => true
6214  *    s.include?('fo')   # => true
6215  *    s.include?('food') # => false
6216  *
6217  */
6218
6219 static VALUE
6220 rb_str_include(VALUE str, VALUE arg)
6221 {
6222     long i;
6223
6224     StringValue(arg);
6225     i = rb_str_index(str, arg, 0);
6226
6227     return RBOOL(i != -1);
6228 }
6229
6230
6231 /*
6232  *  call-seq:
6233  *    to_i(base = 10) -> integer
6234  *
6235  *  Returns the result of interpreting leading characters in +self+
6236  *  as an integer in the given +base+ (which must be in (2..36)):
6237  *
6238  *    '123456'.to_i     # => 123456
6239  *    '123def'.to_i(16) # => 1195503
6240  *
6241  *  Characters past a leading valid number (in the given +base+) are ignored:
6242  *
6243  *    '12.345'.to_i   # => 12
6244  *    '12345'.to_i(2) # => 1
6245  *
6246  *  Returns zero if there is no leading valid number:
6247  *
6248  *    'abcdef'.to_i # => 0
6249  *    '2'.to_i(2)   # => 0
6250  *
6251  */
6252
6253 static VALUE
6254 rb_str_to_i(int argc, VALUE *argv, VALUE str)
6255 {
6256     int base = 10;
6257
6258     if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6259         rb_raise(rb_eArgError, "invalid radix %d", base);
6260     }
6261     return rb_str_to_inum(str, base, FALSE);
6262 }
6263
6264
6265 /*
6266  *  call-seq:
6267  *    to_f -> float
6268  *
6269  *  Returns the result of interpreting leading characters in +self+ as a Float:
6270  *
6271  *    '3.14159'.to_f  # => 3.14159
 *    '1.234e-2'.to_f # => 0.01234
6273  *
 *  Characters past a leading valid number are ignored:
6275  *
6276  *    '3.14 (pi to two places)'.to_f # => 3.14
6277  *
6278  *  Returns zero if there is no leading valid number:
6279  *
6280  *    'abcdef'.to_f # => 0.0
6281  *
6282  */
6283
6284 static VALUE
6285 rb_str_to_f(VALUE str)
6286 {
6287     return DBL2NUM(rb_str_to_dbl(str, FALSE));
6288 }
6289
6290
6291 /*
6292  *  call-seq:
6293  *    to_s -> self or string
6294  *
6295  *  Returns +self+ if +self+ is a \String,
6296  *  or +self+ converted to a \String if +self+ is a subclass of \String.
6297  *
6298  *  String#to_str is an alias for String#to_s.
6299  *
6300  */
6301
6302 static VALUE
6303 rb_str_to_s(VALUE str)
6304 {
6305     if (rb_obj_class(str) != rb_cString) {
6306         return str_duplicate(rb_cString, str);
6307     }
6308     return str;
6309 }
6310
#if 0
/*
 * Compiled out by the surrounding "#if 0".
 *
 * Writes the code +c+ into a local buffer with rb_enc_mbcput() and appends
 * the rb_enc_codelen(c, enc) resulting bytes to +str+ through
 * rb_enc_str_buf_cat().
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
6322
6323 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6324
6325 int
6326 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6327 {
6328     char buf[CHAR_ESC_LEN + 1];
6329     int l;
6330
6331 #if SIZEOF_INT > 4
6332     c &= 0xffffffff;
6333 #endif
6334     if (unicode_p) {
6335         if (c < 0x7F && ISPRINT(c)) {
6336             snprintf(buf, CHAR_ESC_LEN, "%c", c);
6337         }
6338         else if (c < 0x10000) {
6339             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6340         }
6341         else {
6342             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6343         }
6344     }
6345     else {
6346         if (c < 0x100) {
6347             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6348         }
6349         else {
6350             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6351         }
6352     }
6353     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
6354     rb_str_buf_cat(result, buf, l);
6355     return l;
6356 }
6357
/*
 * Returns the backslash notation for the character code +c+ when it has a
 * dedicated one (for example "\\n" for a newline), or NULL otherwise.
 * The returned pointer refers to static storage.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *notation;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].notation;
    }
    return NULL;
}
6375
6376 VALUE
6377 rb_str_escape(VALUE str)
6378 {
6379     int encidx = ENCODING_GET(str);
6380     rb_encoding *enc = rb_enc_from_index(encidx);
6381     const char *p = RSTRING_PTR(str);
6382     const char *pend = RSTRING_END(str);
6383     const char *prev = p;
6384     char buf[CHAR_ESC_LEN + 1];
6385     VALUE result = rb_str_buf_new(0);
6386     int unicode_p = rb_enc_unicode_p(enc);
6387     int asciicompat = rb_enc_asciicompat(enc);
6388
6389     while (p < pend) {
6390         unsigned int c;
6391         const char *cc;
6392         int n = rb_enc_precise_mbclen(p, pend, enc);
6393         if (!MBCLEN_CHARFOUND_P(n)) {
6394             if (p > prev) str_buf_cat(result, prev, p - prev);
6395             n = rb_enc_mbminlen(enc);
6396             if (pend < p + n)
6397                 n = (int)(pend - p);
6398             while (n--) {
6399                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6400                 str_buf_cat(result, buf, strlen(buf));
6401                 prev = ++p;
6402             }
6403             continue;
6404         }
6405         n = MBCLEN_CHARFOUND_LEN(n);
6406         c = rb_enc_mbc_to_codepoint(p, pend, enc);
6407         p += n;
6408         cc = ruby_escaped_char(c);
6409         if (cc) {
6410             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6411             str_buf_cat(result, cc, strlen(cc));
6412             prev = p;
6413         }
6414         else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6415         }
6416         else {
6417             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6418             rb_str_buf_cat_escaped_char(result, c, unicode_p);
6419             prev = p;
6420         }
6421     }
6422     if (p > prev) str_buf_cat(result, prev, p - prev);
6423     ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6424
6425     return result;
6426 }
6427
6428 /*
6429  *  call-seq:
6430  *    inspect -> string
6431  *
6432  *  Returns a printable version of +self+, enclosed in double-quotes,
6433  *  and with special characters escaped:
6434  *
6435  *    s = "foo\tbar\tbaz\n"
6436  *    # => "foo\tbar\tbaz\n"
6437  *    s.inspect
6438  *    # => "\"foo\\tbar\\tbaz\\n\""
6439  *
6440  */
6441
VALUE
rb_str_inspect(VALUE str)
{
    int encidx = ENCODING_GET(str);
    rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
    const char *p, *pend, *prev;
    char buf[CHAR_ESC_LEN + 1];
    VALUE result = rb_str_buf_new(0);
    rb_encoding *resenc = rb_default_internal_encoding();
    int unicode_p = rb_enc_unicode_p(enc);
    int asciicompat = rb_enc_asciicompat(enc);

    /* The result encoding is the default internal encoding, else the default
     * external one; US-ASCII is used when that is not ASCII-compatible. */
    if (resenc == NULL) resenc = rb_default_external_encoding();
    if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
    rb_enc_associate(result, resenc);
    str_buf_cat2(result, "\"");

    /* [prev, p) is the run of source bytes scanned so far that will be
     * copied to the result unchanged at the next flush. */
    p = RSTRING_PTR(str); pend = RSTRING_END(str);
    prev = p;
    actenc = get_actual_encoding(encidx, str);
    if (actenc != enc) {
        enc = actenc;
        if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
    }
    while (p < pend) {
        unsigned int c, cc;
        int n;

        n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
            /* No character at p: flush the run, then write up to
             * rb_enc_mbminlen(enc) bytes (clipped to the end) as \xNN. */
            if (p > prev) str_buf_cat(result, prev, p - prev);
            n = rb_enc_mbminlen(enc);
            if (pend < p + n)
                n = (int)(pend - p);
            while (n--) {
                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
                str_buf_cat(result, buf, strlen(buf));
                prev = ++p;
            }
            continue;
        }
        n = MBCLEN_CHARFOUND_LEN(n);
        c = rb_enc_mbc_to_codepoint(p, pend, enc);
        p += n;
        /* A double quote, a backslash, or a '#' directly followed by '$',
         * '@' or '{' gets a backslash in front.  cc is assigned inside the
         * condition through the comma expression. */
        if ((asciicompat || unicode_p) &&
          (c == '"'|| c == '\\' ||
            (c == '#' &&
             p < pend &&
             MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
             (cc = rb_enc_codepoint(p,pend,enc),
              (cc == '$' || cc == '@' || cc == '{'))))) {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            str_buf_cat2(result, "\\");
            if (asciicompat || enc == resenc) {
                /* restart the run at the character itself so that it is
                 * copied verbatim after the backslash */
                prev = p - n;
                continue;
            }
        }
        /* Control characters that have a one-letter escape. */
        switch (c) {
          case '\n': cc = 'n'; break;
          case '\r': cc = 'r'; break;
          case '\t': cc = 't'; break;
          case '\f': cc = 'f'; break;
          case '\013': cc = 'v'; break;
          case '\010': cc = 'b'; break;
          case '\007': cc = 'a'; break;
          case 033: cc = 'e'; break;
          default: cc = 0; break;
        }
        if (cc) {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            buf[0] = '\\';
            buf[1] = (char)cc;
            str_buf_cat(result, buf, 2);
            prev = p;
            continue;
        }
        /* Printable characters stay in the run; everything else is flushed
         * and written through rb_str_buf_cat_escaped_char(). */
        if ((enc == resenc && rb_enc_isprint(c, enc)) ||
            (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
            continue;
        }
        else {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            rb_str_buf_cat_escaped_char(result, c, unicode_p);
            prev = p;
            continue;
        }
    }
    /* copy whatever is left of the run, then the closing quote */
    if (p > prev) str_buf_cat(result, prev, p - prev);
    str_buf_cat2(result, "\"");

    return result;
}
6535
6536 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6537
6538 /*
6539  *  call-seq:
6540  *    dump -> string
6541  *
6542  *  Returns a printable version of +self+, enclosed in double-quotes,
6543  *  with special characters escaped, and with non-printing characters
6544  *  replaced by hexadecimal notation:
6545  *
6546  *    "hello \n ''".dump    # => "\"hello \\n ''\""
6547  *    "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6548  *
6549  *  Related: String#undump (inverse of String#dump).
6550  *
6551  */
6552
VALUE
rb_str_dump(VALUE str)
{
    int encidx = rb_enc_get_index(str);
    rb_encoding *enc = rb_enc_from_index(encidx);
    long len;
    const char *p, *pend;
    char *q, *qend;
    VALUE result;
    int u8 = (encidx == rb_utf8_encindex());
    static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";

    /*
     * Pass 1: compute the length of the dumped form so that the result can
     * be allocated once.  For an encoding that is not ASCII-compatible the
     * suffix (with "%s" replaced by the encoding name) is counted as well.
     */
    len = 2;                    /* "" */
    if (!rb_enc_asciicompat(enc)) {
        len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
        len += strlen(enc->name);
    }

    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    while (p < pend) {
        int clen;               /* output bytes needed for this input unit */
        unsigned char c = *p++;

        switch (c) {
          case '"':  case '\\':
          case '\n': case '\r':
          case '\t': case '\f':
          case '\013': case '\010': case '\007': case '\033':
            clen = 2;
            break;

          case '#':
            /* '#' is escaped only when it would start an interpolation */
            clen = IS_EVSTR(p, pend) ? 2 : 1;
            break;

          default:
            if (ISPRINT(c)) {
                clen = 1;
            }
            else {
                if (u8 && c > 0x7F) {   /* \u notation */
                    int n = rb_enc_precise_mbclen(p-1, pend, enc);
                    if (MBCLEN_CHARFOUND_P(n)) {
                        unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                        if (cc <= 0xFFFF)
                            clen = 6;  /* \uXXXX */
                        else if (cc <= 0xFFFFF)
                            clen = 9;  /* \u{XXXXX} */
                        else
                            clen = 10; /* \u{XXXXXX} */
                        /* skip the remaining bytes of this character */
                        p += MBCLEN_CHARFOUND_LEN(n)-1;
                        break;
                    }
                }
                clen = 4;       /* \xNN */
            }
            break;
        }

        /* guard the running total against overflowing a long */
        if (clen > LONG_MAX - len) {
            rb_raise(rb_eRuntimeError, "string size too big");
        }
        len += clen;
    }

    /*
     * Pass 2: write the dumped form.  qend is one past len bytes plus one,
     * and is used as the bound for the snprintf() calls below.
     */
    result = rb_str_new(0, len);
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    q = RSTRING_PTR(result); qend = q + len + 1;

    *q++ = '"';
    while (p < pend) {
        unsigned char c = *p++;

        if (c == '"' || c == '\\') {
            *q++ = '\\';
            *q++ = c;
        }
        else if (c == '#') {
            if (IS_EVSTR(p, pend)) *q++ = '\\';
            *q++ = '#';
        }
        else if (c == '\n') {
            *q++ = '\\';
            *q++ = 'n';
        }
        else if (c == '\r') {
            *q++ = '\\';
            *q++ = 'r';
        }
        else if (c == '\t') {
            *q++ = '\\';
            *q++ = 't';
        }
        else if (c == '\f') {
            *q++ = '\\';
            *q++ = 'f';
        }
        else if (c == '\013') {
            *q++ = '\\';
            *q++ = 'v';
        }
        else if (c == '\010') {
            *q++ = '\\';
            *q++ = 'b';
        }
        else if (c == '\007') {
            *q++ = '\\';
            *q++ = 'a';
        }
        else if (c == '\033') {
            *q++ = '\\';
            *q++ = 'e';
        }
        else if (ISPRINT(c)) {
            *q++ = c;
        }
        else {
            *q++ = '\\';
            if (u8) {
                /* NOTE(review): the CHARFOUND test is applied to the length
                 * minus one, so a single-byte character presumably falls
                 * through to the \xNN form below, matching pass 1 -- confirm
                 * against the definition of MBCLEN_CHARFOUND_P. */
                int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
                if (MBCLEN_CHARFOUND_P(n)) {
                    int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                    p += n;
                    if (cc <= 0xFFFF)
                        snprintf(q, qend-q, "u%04X", cc);    /* \uXXXX */
                    else
                        snprintf(q, qend-q, "u{%X}", cc);  /* \u{XXXXX} or \u{XXXXXX} */
                    q += strlen(q);
                    continue;
                }
            }
            snprintf(q, qend-q, "x%02X", c);
            q += 3;             /* 'x' plus two hex digits */
        }
    }
    *q++ = '"';
    *q = '\0';
    if (!rb_enc_asciicompat(enc)) {
        /* append the suffix naming the original encoding; the result
         * itself is then tagged with the ASCII-8BIT index */
        snprintf(q, qend-q, nonascii_suffix, enc->name);
        encidx = rb_ascii8bit_encindex();
    }
    /* result from dump is ASCII */
    rb_enc_associate_index(result, encidx);
    ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
    return result;
}
6699
/*
 * Maps the letter that follows a backslash in a one-letter escape
 * (n, r, t, f, v, b, a, e) to the character it denotes.  Any other value
 * reaches UNREACHABLE_RETURN(-1).
 */
static int
unescape_ascii(unsigned int c)
{
    static const struct {
        unsigned int letter;
        int value;
    } escapes[] = {
        {'n', '\n'},
        {'r', '\r'},
        {'t', '\t'},
        {'f', '\f'},
        {'v', '\13'},
        {'b', '\010'},
        {'a', '\007'},
        {'e', 033},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].letter == c) return escapes[i].value;
    }
    UNREACHABLE_RETURN(-1);
}
6723
/*
 * Decodes one escape sequence of a dumped string and appends the result to
 * +undumped+.
 *
 * undumped:: string being built
 * ss::       in/out cursor; on entry *ss points at the character right
 *            after the backslash, on return just past the escape
 * s_end::    end of the input
 * penc::     in/out encoding of +undumped+; set to UTF-8 when a \u escape
 *            is seen
 * utf8::     in/out flag, set once a \u escape has been seen
 * binary::   in/out flag, set once a \x escape has been seen
 *
 * Raises RuntimeError for malformed escapes and when \u and \x escapes are
 * mixed.
 */
static void
undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
{
    const char *s = *ss;
    unsigned int c;
    int codelen;
    size_t hexlen;
    unsigned char buf[6];
    static rb_encoding *enc_utf8 = NULL;

    switch (*s) {
      case '\\':
      case '"':
      case '#':
        rb_str_cat(undumped, s, 1); /* cat itself */
        s++;
        break;
      case 'n':
      case 'r':
      case 't':
      case 'f':
      case 'v':
      case 'b':
      case 'a':
      case 'e':
        *buf = unescape_ascii(*s);
        rb_str_cat(undumped, (char *)buf, 1);
        s++;
        break;
      case 'u':
        if (*binary) {
            rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
        }
        *utf8 = true;
        if (++s >= s_end) {
            rb_raise(rb_eRuntimeError, "invalid Unicode escape");
        }
        /* the UTF-8 encoding is looked up once and kept in a static */
        if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
        if (*penc != enc_utf8) {
            *penc = enc_utf8;
            rb_enc_associate(undumped, enc_utf8);
        }
        if (*s == '{') { /* handle \u{...} form */
            s++;
            /* the braces may hold several codepoints separated by spaces */
            for (;;) {
                if (s >= s_end) {
                    rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
                }
                if (*s == '}') {
                    s++;
                    break;
                }
                if (ISSPACE(*s)) {
                    s++;
                    continue;
                }
                c = scan_hex(s, s_end-s, &hexlen);
                if (hexlen == 0 || hexlen > 6) {
                    rb_raise(rb_eRuntimeError, "invalid Unicode escape");
                }
                if (c > 0x10ffff) {
                    rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
                }
                /* surrogate range */
                if (0xd800 <= c && c <= 0xdfff) {
                    rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
                }
                codelen = rb_enc_mbcput(c, (char *)buf, *penc);
                rb_str_cat(undumped, (char *)buf, codelen);
                s += hexlen;
            }
        }
        else { /* handle \uXXXX form */
            c = scan_hex(s, 4, &hexlen);
            if (hexlen != 4) {
                rb_raise(rb_eRuntimeError, "invalid Unicode escape");
            }
            if (0xd800 <= c && c <= 0xdfff) {
                rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
            }
            codelen = rb_enc_mbcput(c, (char *)buf, *penc);
            rb_str_cat(undumped, (char *)buf, codelen);
            s += hexlen;
        }
        break;
      case 'x':
        if (*utf8) {
            rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
        }
        *binary = true;
        if (++s >= s_end) {
            rb_raise(rb_eRuntimeError, "invalid hex escape");
        }
        /* exactly two hex digits are required */
        *buf = scan_hex(s, 2, &hexlen);
        if (hexlen != 2) {
            rb_raise(rb_eRuntimeError, "invalid hex escape");
        }
        rb_str_cat(undumped, (char *)buf, 1);
        s += hexlen;
        break;
      default:
        /* unknown escape: keep the backslash and the character as they are */
        rb_str_cat(undumped, s-1, 2);
        s++;
    }

    *ss = s;
}
6830
6831 static VALUE rb_str_is_ascii_only_p(VALUE str);
6832
6833 /*
6834  *  call-seq:
6835  *    undump -> string
6836  *
6837  *  Returns an unescaped version of +self+:
6838  *
6839  *    s_orig = "\f\x00\xff\\\""    # => "\f\u0000\xFF\\\""
6840  *    s_dumped = s_orig.dump       # => "\"\\f\\x00\\xFF\\\\\\\"\""
6841  *    s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
6842  *    s_undumped == s_orig         # => true
6843  *
6844  *  Related: String#dump (inverse of String#undump).
6845  *
6846  */
6847
6848 static VALUE
6849 str_undump(VALUE str)
6850 {
6851     const char *s = RSTRING_PTR(str);
6852     const char *s_end = RSTRING_END(str);
6853     rb_encoding *enc = rb_enc_get(str);
6854     VALUE undumped = rb_enc_str_new(s, 0L, enc);
6855     bool utf8 = false;
6856     bool binary = false;
6857     int w;
6858
6859     rb_must_asciicompat(str);
6860     if (rb_str_is_ascii_only_p(str) == Qfalse) {
6861         rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6862     }
6863     if (!str_null_check(str, &w)) {
6864         rb_raise(rb_eRuntimeError, "string contains null byte");
6865     }
6866     if (RSTRING_LEN(str) < 2) goto invalid_format;
6867     if (*s != '"') goto invalid_format;
6868
6869     /* strip '"' at the start */
6870     s++;
6871
6872     for (;;) {
6873         if (s >= s_end) {
6874             rb_raise(rb_eRuntimeError, "unterminated dumped string");
6875         }
6876
6877         if (*s == '"') {
6878             /* epilogue */
6879             s++;
6880             if (s == s_end) {
6881                 /* ascii compatible dumped string */
6882                 break;
6883             }
6884             else {
6885                 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6886                 static const char dup_suffix[] = ".dup";
6887                 const char *encname;
6888                 int encidx;
6889                 ptrdiff_t size;
6890
6891                 /* check separately for strings dumped by older versions */
6892                 size = sizeof(dup_suffix) - 1;
6893                 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6894
6895                 size = sizeof(force_encoding_suffix) - 1;
6896                 if (s_end - s <= size) goto invalid_format;
6897                 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6898                 s += size;
6899
6900                 if (utf8) {
6901                     rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6902                 }
6903
6904                 encname = s;
6905                 s = memchr(s, '"', s_end-s);
6906                 size = s - encname;
6907                 if (!s) goto invalid_format;
6908                 if (s_end - s != 2) goto invalid_format;
6909                 if (s[0] != '"' || s[1] != ')') goto invalid_format;
6910
6911                 encidx = rb_enc_find_index2(encname, (long)size);
6912                 if (encidx < 0) {
6913                     rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6914                 }
6915                 rb_enc_associate_index(undumped, encidx);
6916             }
6917             break;
6918         }
6919
6920         if (*s == '\\') {
6921             s++;
6922             if (s >= s_end) {
6923                 rb_raise(rb_eRuntimeError, "invalid escape");
6924             }
6925             undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6926         }
6927         else {
6928             rb_str_cat(undumped, s++, 1);
6929         }
6930     }
6931
6932     return undumped;
6933 invalid_format:
6934     rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6935 }
6936
6937 static void
6938 rb_str_check_dummy_enc(rb_encoding *enc)
6939 {
6940     if (rb_enc_dummy_p(enc)) {
6941         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6942                  rb_enc_name(enc));
6943     }
6944 }
6945
6946 static rb_encoding *
6947 str_true_enc(VALUE str)
6948 {
6949     rb_encoding *enc = STR_ENC_GET(str);
6950     rb_str_check_dummy_enc(enc);
6951     return enc;
6952 }
6953
6954 static OnigCaseFoldType
6955 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6956 {
6957     if (argc==0)
6958         return flags;
6959     if (argc>2)
6960         rb_raise(rb_eArgError, "too many options");
6961     if (argv[0]==sym_turkic) {
6962         flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6963         if (argc==2) {
6964             if (argv[1]==sym_lithuanian)
6965                 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6966             else
6967                 rb_raise(rb_eArgError, "invalid second option");
6968         }
6969     }
6970     else if (argv[0]==sym_lithuanian) {
6971         flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6972         if (argc==2) {
6973             if (argv[1]==sym_turkic)
6974                 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6975             else
6976                 rb_raise(rb_eArgError, "invalid second option");
6977         }
6978     }
6979     else if (argc>1)
6980         rb_raise(rb_eArgError, "too many options");
6981     else if (argv[0]==sym_ascii)
6982         flags |= ONIGENC_CASE_ASCII_ONLY;
6983     else if (argv[0]==sym_fold) {
6984         if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
6985             flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
6986         else
6987             rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
6988     }
6989     else
6990         rb_raise(rb_eArgError, "invalid option");
6991     return flags;
6992 }
6993
6994 static inline bool
6995 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
6996 {
6997     if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
6998         return true;
6999     return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7000 }
7001
7002 /* 16 should be long enough to absorb any kind of single character length increase */
7003 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
7004 #ifndef CASEMAP_DEBUG
7005 # define CASEMAP_DEBUG 0
7006 #endif
7007
struct mapping_buffer;
/*
 * One chunk of case-mapping output.  Chunks form a singly linked list
 * through +next+ and are released together by mapping_buffer_free().
 */
typedef struct mapping_buffer {
    size_t capa;                    /* byte capacity of +space+ */
    size_t used;                    /* NOTE(review): presumably the bytes of +space+ in use; set outside the code visible here -- confirm */
    struct mapping_buffer *next;    /* following chunk, or NULL for the last one */
    OnigUChar space[FLEX_ARY_LEN];  /* trailing storage for the mapped bytes */
} mapping_buffer;
7015
7016 static void
7017 mapping_buffer_free(void *p)
7018 {
7019     mapping_buffer *previous_buffer;
7020     mapping_buffer *current_buffer = p;
7021     while (current_buffer) {
7022         previous_buffer = current_buffer;
7023         current_buffer  = current_buffer->next;
7024         ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7025     }
7026 }
7027
7028 static const rb_data_type_t mapping_buffer_type = {
7029     "mapping_buffer",
7030     {0, mapping_buffer_free,}
7031 };
7032
7033 static VALUE
7034 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7035 {
7036     VALUE target;
7037
7038     const OnigUChar *source_current, *source_end;
7039     int target_length = 0;
7040     VALUE buffer_anchor;
7041     mapping_buffer *current_buffer = 0;
7042     mapping_buffer **pre_buffer;
7043     size_t buffer_count = 0;
7044     int buffer_length_or_invalid;
7045
7046     if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7047
7048     source_current = (OnigUChar*)RSTRING_PTR(source);
7049     source_end = (OnigUChar*)RSTRING_END(source);
7050
7051     buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7052     pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7053     while (source_current < source_end) {
7054         /* increase multiplier using buffer count to converge quickly */
7055         size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7056         if (CASEMAP_DEBUG) {
7057             fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7058         }
7059         current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7060         *pre_buffer = current_buffer;
7061         pre_buffer = &current_buffer->next;
7062         current_buffer->next = NULL;
7063         current_buffer->capa = capa;
7064         buffer_length_or_invalid = enc->case_map(flags,
7065                                    &source_current, source_end,
7066                                    current_buffer->space,
7067                                    current_buffer->space+current_buffer->capa,
7068                                    enc);
7069         if (buffer_length_or_invalid < 0) {
7070             current_buffer = DATA_PTR(buffer_anchor);
7071             DATA_PTR(buffer_anchor) = 0;
7072             mapping_buffer_free(current_buffer);
7073             rb_raise(rb_eArgError, "input string invalid");
7074         }
7075         target_length  += current_buffer->used = buffer_length_or_invalid;
7076     }
7077     if (CASEMAP_DEBUG) {
7078         fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7079     }
7080
7081     if (buffer_count==1) {
7082         target = rb_str_new((const char*)current_buffer->space, target_length);
7083     }
7084     else {
7085         char *target_current;
7086
7087         target = rb_str_new(0, target_length);
7088         target_current = RSTRING_PTR(target);
7089         current_buffer = DATA_PTR(buffer_anchor);
7090         while (current_buffer) {
7091             memcpy(target_current, current_buffer->space, current_buffer->used);
7092             target_current += current_buffer->used;
7093             current_buffer  = current_buffer->next;
7094         }
7095     }
7096     current_buffer = DATA_PTR(buffer_anchor);
7097     DATA_PTR(buffer_anchor) = 0;
7098     mapping_buffer_free(current_buffer);
7099
7100     /* TODO: check about string terminator character */
7101     str_enc_copy(target, source);
7102     /*ENC_CODERANGE_SET(mapped, cr);*/
7103
7104     return target;
7105 }
7106
7107 static VALUE
7108 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7109 {
7110     const OnigUChar *source_current, *source_end;
7111     OnigUChar *target_current, *target_end;
7112     long old_length = RSTRING_LEN(source);
7113     int length_or_invalid;
7114
7115     if (old_length == 0) return Qnil;
7116
7117     source_current = (OnigUChar*)RSTRING_PTR(source);
7118     source_end = (OnigUChar*)RSTRING_END(source);
7119     if (source == target) {
7120         target_current = (OnigUChar*)source_current;
7121         target_end = (OnigUChar*)source_end;
7122     }
7123     else {
7124         target_current = (OnigUChar*)RSTRING_PTR(target);
7125         target_end = (OnigUChar*)RSTRING_END(target);
7126     }
7127
7128     length_or_invalid = onigenc_ascii_only_case_map(flags,
7129                                &source_current, source_end,
7130                                target_current, target_end, enc);
7131     if (length_or_invalid < 0)
7132         rb_raise(rb_eArgError, "input string invalid");
7133     if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7134         fprintf(stderr, "problem with rb_str_ascii_casemap"
7135                 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7136         rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7137                  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7138     }
7139
7140     str_enc_copy(target, source);
7141
7142     return target;
7143 }
7144
7145 static bool
7146 upcase_single(VALUE str)
7147 {
7148     char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7149     bool modified = false;
7150
7151     while (s < send) {
7152         unsigned int c = *(unsigned char*)s;
7153
7154         if ('a' <= c && c <= 'z') {
7155             *s = 'A' + (c - 'a');
7156             modified = true;
7157         }
7158         s++;
7159     }
7160     return modified;
7161 }
7162
7163 /*
7164  *  call-seq:
7165  *    upcase!(*options) -> self or nil
7166  *
7167  *  Upcases the characters in +self+;
7168  *  returns +self+ if any changes were made, +nil+ otherwise:
7169  *
7170  *    s = 'Hello World!' # => "Hello World!"
7171  *    s.upcase!          # => "HELLO WORLD!"
7172  *    s                  # => "HELLO WORLD!"
7173  *    s.upcase!          # => nil
7174  *
7175  *  The casing may be affected by the given +options+;
7176  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7177  *
7178  *  Related: String#upcase, String#downcase, String#downcase!.
7179  *
7180  */
7181
7182 static VALUE
7183 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7184 {
7185     rb_encoding *enc;
7186     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7187
7188     flags = check_case_options(argc, argv, flags);
7189     str_modify_keep_cr(str);
7190     enc = str_true_enc(str);
7191     if (case_option_single_p(flags, enc, str)) {
7192         if (upcase_single(str))
7193             flags |= ONIGENC_CASE_MODIFIED;
7194     }
7195     else if (flags&ONIGENC_CASE_ASCII_ONLY)
7196         rb_str_ascii_casemap(str, str, &flags, enc);
7197     else
7198         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7199
7200     if (ONIGENC_CASE_MODIFIED&flags) return str;
7201     return Qnil;
7202 }
7203
7204
7205 /*
7206  *  call-seq:
7207  *    upcase(*options) -> string
7208  *
7209  *  Returns a string containing the upcased characters in +self+:
7210  *
7211  *     s = 'Hello World!' # => "Hello World!"
7212  *     s.upcase           # => "HELLO WORLD!"
7213  *
7214  *  The casing may be affected by the given +options+;
7215  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7216  *
7217  *  Related: String#upcase!, String#downcase, String#downcase!.
7218  *
7219  */
7220
7221 static VALUE
7222 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7223 {
7224     rb_encoding *enc;
7225     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7226     VALUE ret;
7227
7228     flags = check_case_options(argc, argv, flags);
7229     enc = str_true_enc(str);
7230     if (case_option_single_p(flags, enc, str)) {
7231         ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7232         str_enc_copy(ret, str);
7233         upcase_single(ret);
7234     }
7235     else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7236         ret = rb_str_new(0, RSTRING_LEN(str));
7237         rb_str_ascii_casemap(str, ret, &flags, enc);
7238     }
7239     else {
7240         ret = rb_str_casemap(str, &flags, enc);
7241     }
7242
7243     return ret;
7244 }
7245
7246 static bool
7247 downcase_single(VALUE str)
7248 {
7249     char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7250     bool modified = false;
7251
7252     while (s < send) {
7253         unsigned int c = *(unsigned char*)s;
7254
7255         if ('A' <= c && c <= 'Z') {
7256             *s = 'a' + (c - 'A');
7257             modified = true;
7258         }
7259         s++;
7260     }
7261
7262     return modified;
7263 }
7264
7265 /*
7266  *  call-seq:
7267  *    downcase!(*options) -> self or nil
7268  *
7269  *  Downcases the characters in +self+;
7270  *  returns +self+ if any changes were made, +nil+ otherwise:
7271  *
7272  *    s = 'Hello World!' # => "Hello World!"
7273  *    s.downcase!        # => "hello world!"
7274  *    s                  # => "hello world!"
7275  *    s.downcase!        # => nil
7276  *
7277  *  The casing may be affected by the given +options+;
7278  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7279  *
7280  *  Related: String#downcase, String#upcase, String#upcase!.
7281  *
7282  */
7283
7284 static VALUE
7285 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7286 {
7287     rb_encoding *enc;
7288     OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7289
7290     flags = check_case_options(argc, argv, flags);
7291     str_modify_keep_cr(str);
7292     enc = str_true_enc(str);
7293     if (case_option_single_p(flags, enc, str)) {
7294         if (downcase_single(str))
7295             flags |= ONIGENC_CASE_MODIFIED;
7296     }
7297     else if (flags&ONIGENC_CASE_ASCII_ONLY)
7298         rb_str_ascii_casemap(str, str, &flags, enc);
7299     else
7300         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7301
7302     if (ONIGENC_CASE_MODIFIED&flags) return str;
7303     return Qnil;
7304 }
7305
7306
7307 /*
7308  *  call-seq:
7309  *    downcase(*options) -> string
7310  *
7311  *  Returns a string containing the downcased characters in +self+:
7312  *
7313  *     s = 'Hello World!' # => "Hello World!"
7314  *     s.downcase         # => "hello world!"
7315  *
7316  *  The casing may be affected by the given +options+;
7317  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7318  *
7319  *  Related: String#downcase!, String#upcase, String#upcase!.
7320  *
7321  */
7322
7323 static VALUE
7324 rb_str_downcase(int argc, VALUE *argv, VALUE str)
7325 {
7326     rb_encoding *enc;
7327     OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7328     VALUE ret;
7329
7330     flags = check_case_options(argc, argv, flags);
7331     enc = str_true_enc(str);
7332     if (case_option_single_p(flags, enc, str)) {
7333         ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7334         str_enc_copy(ret, str);
7335         downcase_single(ret);
7336     }
7337     else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7338         ret = rb_str_new(0, RSTRING_LEN(str));
7339         rb_str_ascii_casemap(str, ret, &flags, enc);
7340     }
7341     else {
7342         ret = rb_str_casemap(str, &flags, enc);
7343     }
7344
7345     return ret;
7346 }
7347
7348
7349 /*
7350  *  call-seq:
7351  *    capitalize!(*options) -> self or nil
7352  *
7353  *  Upcases the first character in +self+;
7354  *  downcases the remaining characters;
7355  *  returns +self+ if any changes were made, +nil+ otherwise:
7356  *
7357  *    s = 'hello World!' # => "hello World!"
7358  *    s.capitalize!      # => "Hello world!"
7359  *    s                  # => "Hello world!"
7360  *    s.capitalize!      # => nil
7361  *
7362  *  The casing may be affected by the given +options+;
7363  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7364  *
7365  *  Related: String#capitalize.
7366  *
7367  */
7368
7369 static VALUE
7370 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7371 {
7372     rb_encoding *enc;
7373     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7374
7375     flags = check_case_options(argc, argv, flags);
7376     str_modify_keep_cr(str);
7377     enc = str_true_enc(str);
7378     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7379     if (flags&ONIGENC_CASE_ASCII_ONLY)
7380         rb_str_ascii_casemap(str, str, &flags, enc);
7381     else
7382         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7383
7384     if (ONIGENC_CASE_MODIFIED&flags) return str;
7385     return Qnil;
7386 }
7387
7388
7389 /*
7390  *  call-seq:
7391  *    capitalize(*options) -> string
7392  *
7393  *  Returns a string containing the characters in +self+;
7394  *  the first character is upcased;
7395  *  the remaining characters are downcased:
7396  *
7397  *     s = 'hello World!' # => "hello World!"
7398  *     s.capitalize       # => "Hello world!"
7399  *
7400  *  The casing may be affected by the given +options+;
7401  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7402  *
7403  *  Related: String#capitalize!.
7404  *
7405  */
7406
7407 static VALUE
7408 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7409 {
7410     rb_encoding *enc;
7411     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7412     VALUE ret;
7413
7414     flags = check_case_options(argc, argv, flags);
7415     enc = str_true_enc(str);
7416     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7417     if (flags&ONIGENC_CASE_ASCII_ONLY) {
7418         ret = rb_str_new(0, RSTRING_LEN(str));
7419         rb_str_ascii_casemap(str, ret, &flags, enc);
7420     }
7421     else {
7422         ret = rb_str_casemap(str, &flags, enc);
7423     }
7424     return ret;
7425 }
7426
7427
7428 /*
7429  *  call-seq:
7430  *    swapcase!(*options) -> self or nil
7431  *
7432  *  Upcases each lowercase character in +self+;
7433  *  downcases each uppercase character;
7434  *  returns +self+ if any changes were made, +nil+ otherwise:
7435  *
7436  *    s = 'Hello World!' # => "Hello World!"
7437  *    s.swapcase!        # => "hELLO wORLD!"
7438  *    s                  # => "hELLO wORLD!"
7439  *    ''.swapcase!       # => nil
7440  *
7441  *  The casing may be affected by the given +options+;
7442  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7443  *
7444  *  Related: String#swapcase.
7445  *
7446  */
7447
7448 static VALUE
7449 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7450 {
7451     rb_encoding *enc;
7452     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7453
7454     flags = check_case_options(argc, argv, flags);
7455     str_modify_keep_cr(str);
7456     enc = str_true_enc(str);
7457     if (flags&ONIGENC_CASE_ASCII_ONLY)
7458         rb_str_ascii_casemap(str, str, &flags, enc);
7459     else
7460         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7461
7462     if (ONIGENC_CASE_MODIFIED&flags) return str;
7463     return Qnil;
7464 }
7465
7466
7467 /*
7468  *  call-seq:
7469  *    swapcase(*options) -> string
7470  *
7471  *  Returns a string containing the characters in +self+, with cases reversed;
7472  *  each uppercase character is downcased;
7473  *  each lowercase character is upcased:
7474  *
7475  *     s = 'Hello World!' # => "Hello World!"
7476  *     s.swapcase         # => "hELLO wORLD!"
7477  *
7478  *  The casing may be affected by the given +options+;
7479  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7480  *
7481  *  Related: String#swapcase!.
7482  *
7483  */
7484
7485 static VALUE
7486 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7487 {
7488     rb_encoding *enc;
7489     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7490     VALUE ret;
7491
7492     flags = check_case_options(argc, argv, flags);
7493     enc = str_true_enc(str);
7494     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7495     if (flags&ONIGENC_CASE_ASCII_ONLY) {
7496         ret = rb_str_new(0, RSTRING_LEN(str));
7497         rb_str_ascii_casemap(str, ret, &flags, enc);
7498     }
7499     else {
7500         ret = rb_str_casemap(str, &flags, enc);
7501     }
7502     return ret;
7503 }
7504
7505 typedef unsigned char *USTR;
7506
/* Cursor over a tr-style character specification; advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while trnext() is stepping through a "c1-c2" range */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being stepped through */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
7512
7513 static unsigned int
7514 trnext(struct tr *t, rb_encoding *enc)
7515 {
7516     int n;
7517
7518     for (;;) {
7519       nextpart:
7520         if (!t->gen) {
7521             if (t->p == t->pend) return -1;
7522             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7523                 t->p += n;
7524             }
7525             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7526             t->p += n;
7527             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7528                 t->p += n;
7529                 if (t->p < t->pend) {
7530                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7531                     t->p += n;
7532                     if (t->now > c) {
7533                         if (t->now < 0x80 && c < 0x80) {
7534                             rb_raise(rb_eArgError,
7535                                      "invalid range \"%c-%c\" in string transliteration",
7536                                      t->now, c);
7537                         }
7538                         else {
7539                             rb_raise(rb_eArgError, "invalid range in string transliteration");
7540                         }
7541                         continue; /* not reached */
7542                     }
7543                     t->gen = 1;
7544                     t->max = c;
7545                 }
7546             }
7547             return t->now;
7548         }
7549         else {
7550             while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7551                 if (t->now == t->max) {
7552                     t->gen = 0;
7553                     goto nextpart;
7554                 }
7555             }
7556             if (t->now < t->max) {
7557                 return t->now;
7558             }
7559             else {
7560                 t->gen = 0;
7561                 return t->max;
7562             }
7563         }
7564     }
7565 }
7566
7567 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7568
7569 static VALUE
7570 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7571 {
7572     const unsigned int errc = -1;
7573     unsigned int trans[256];
7574     rb_encoding *enc, *e1, *e2;
7575     struct tr trsrc, trrepl;
7576     int cflag = 0;
7577     unsigned int c, c0, last = 0;
7578     int modify = 0, i, l;
7579     unsigned char *s, *send;
7580     VALUE hash = 0;
7581     int singlebyte = single_byte_optimizable(str);
7582     int termlen;
7583     int cr;
7584
7585 #define CHECK_IF_ASCII(c) \
7586     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7587            (cr = ENC_CODERANGE_VALID) : 0)
7588
7589     StringValue(src);
7590     StringValue(repl);
7591     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7592     if (RSTRING_LEN(repl) == 0) {
7593         return rb_str_delete_bang(1, &src, str);
7594     }
7595
7596     cr = ENC_CODERANGE(str);
7597     e1 = rb_enc_check(str, src);
7598     e2 = rb_enc_check(str, repl);
7599     if (e1 == e2) {
7600         enc = e1;
7601     }
7602     else {
7603         enc = rb_enc_check(src, repl);
7604     }
7605     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7606     if (RSTRING_LEN(src) > 1 &&
7607         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7608         trsrc.p + l < trsrc.pend) {
7609         cflag = 1;
7610         trsrc.p += l;
7611     }
7612     trrepl.p = RSTRING_PTR(repl);
7613     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7614     trsrc.gen = trrepl.gen = 0;
7615     trsrc.now = trrepl.now = 0;
7616     trsrc.max = trrepl.max = 0;
7617
7618     if (cflag) {
7619         for (i=0; i<256; i++) {
7620             trans[i] = 1;
7621         }
7622         while ((c = trnext(&trsrc, enc)) != errc) {
7623             if (c < 256) {
7624                 trans[c] = errc;
7625             }
7626             else {
7627                 if (!hash) hash = rb_hash_new();
7628                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7629             }
7630         }
7631         while ((c = trnext(&trrepl, enc)) != errc)
7632             /* retrieve last replacer */;
7633         last = trrepl.now;
7634         for (i=0; i<256; i++) {
7635             if (trans[i] != errc) {
7636                 trans[i] = last;
7637             }
7638         }
7639     }
7640     else {
7641         unsigned int r;
7642
7643         for (i=0; i<256; i++) {
7644             trans[i] = errc;
7645         }
7646         while ((c = trnext(&trsrc, enc)) != errc) {
7647             r = trnext(&trrepl, enc);
7648             if (r == errc) r = trrepl.now;
7649             if (c < 256) {
7650                 trans[c] = r;
7651                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7652             }
7653             else {
7654                 if (!hash) hash = rb_hash_new();
7655                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7656             }
7657         }
7658     }
7659
7660     if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7661         cr = ENC_CODERANGE_7BIT;
7662     str_modify_keep_cr(str);
7663     s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7664     termlen = rb_enc_mbminlen(enc);
7665     if (sflag) {
7666         int clen, tlen;
7667         long offset, max = RSTRING_LEN(str);
7668         unsigned int save = -1;
7669         unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7670
7671         while (s < send) {
7672             int may_modify = 0;
7673
7674             c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7675             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7676
7677             s += clen;
7678             if (c < 256) {
7679                 c = trans[c];
7680             }
7681             else if (hash) {
7682                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7683                 if (NIL_P(tmp)) {
7684                     if (cflag) c = last;
7685                     else c = errc;
7686                 }
7687                 else if (cflag) c = errc;
7688                 else c = NUM2INT(tmp);
7689             }
7690             else {
7691                 c = errc;
7692             }
7693             if (c != (unsigned int)-1) {
7694                 if (save == c) {
7695                     CHECK_IF_ASCII(c);
7696                     continue;
7697                 }
7698                 save = c;
7699                 tlen = rb_enc_codelen(c, enc);
7700                 modify = 1;
7701             }
7702             else {
7703                 save = -1;
7704                 c = c0;
7705                 if (enc != e1) may_modify = 1;
7706             }
7707             if ((offset = t - buf) + tlen > max) {
7708                 size_t MAYBE_UNUSED(old) = max + termlen;
7709                 max = offset + tlen + (send - s);
7710                 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7711                 t = buf + offset;
7712             }
7713             rb_enc_mbcput(c, t, enc);
7714             if (may_modify && memcmp(s, t, tlen) != 0) {
7715                 modify = 1;
7716             }
7717             CHECK_IF_ASCII(c);
7718             t += tlen;
7719         }
7720         if (!STR_EMBED_P(str)) {
7721             ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7722         }
7723         TERM_FILL((char *)t, termlen);
7724         RSTRING(str)->as.heap.ptr = (char *)buf;
7725         RSTRING(str)->as.heap.len = t - buf;
7726         STR_SET_NOEMBED(str);
7727         RSTRING(str)->as.heap.aux.capa = max;
7728     }
7729     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7730         while (s < send) {
7731             c = (unsigned char)*s;
7732             if (trans[c] != errc) {
7733                 if (!cflag) {
7734                     c = trans[c];
7735                     *s = c;
7736                     modify = 1;
7737                 }
7738                 else {
7739                     *s = last;
7740                     modify = 1;
7741                 }
7742             }
7743             CHECK_IF_ASCII(c);
7744             s++;
7745         }
7746     }
7747     else {
7748         int clen, tlen;
7749         long offset, max = (long)((send - s) * 1.2);
7750         unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7751
7752         while (s < send) {
7753             int may_modify = 0;
7754             c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7755             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7756
7757             if (c < 256) {
7758                 c = trans[c];
7759             }
7760             else if (hash) {
7761                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7762                 if (NIL_P(tmp)) {
7763                     if (cflag) c = last;
7764                     else c = errc;
7765                 }
7766                 else if (cflag) c = errc;
7767                 else c = NUM2INT(tmp);
7768             }
7769             else {
7770                 c = cflag ? last : errc;
7771             }
7772             if (c != errc) {
7773                 tlen = rb_enc_codelen(c, enc);
7774                 modify = 1;
7775             }
7776             else {
7777                 c = c0;
7778                 if (enc != e1) may_modify = 1;
7779             }
7780             if ((offset = t - buf) + tlen > max) {
7781                 size_t MAYBE_UNUSED(old) = max + termlen;
7782                 max = offset + tlen + (long)((send - s) * 1.2);
7783                 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7784                 t = buf + offset;
7785             }
7786             if (s != t) {
7787                 rb_enc_mbcput(c, t, enc);
7788                 if (may_modify && memcmp(s, t, tlen) != 0) {
7789                     modify = 1;
7790                 }
7791             }
7792             CHECK_IF_ASCII(c);
7793             s += clen;
7794             t += tlen;
7795         }
7796         if (!STR_EMBED_P(str)) {
7797             ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7798         }
7799         TERM_FILL((char *)t, termlen);
7800         RSTRING(str)->as.heap.ptr = (char *)buf;
7801         RSTRING(str)->as.heap.len = t - buf;
7802         STR_SET_NOEMBED(str);
7803         RSTRING(str)->as.heap.aux.capa = max;
7804     }
7805
7806     if (modify) {
7807         if (cr != ENC_CODERANGE_BROKEN)
7808             ENC_CODERANGE_SET(str, cr);
7809         rb_enc_associate(str, enc);
7810         return str;
7811     }
7812     return Qnil;
7813 }
7814
7815
7816 /*
7817  *  call-seq:
7818  *     str.tr!(from_str, to_str)   -> str or nil
7819  *
7820  *  Translates <i>str</i> in place, using the same rules as
7821  *  String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
7822  *  were made.
7823  */
7824
7825 static VALUE
7826 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7827 {
7828     return tr_trans(str, src, repl, 0);
7829 }
7830
7831
7832 /*
7833  *  call-seq:
7834  *     str.tr(from_str, to_str)   -> new_str
7835  *
7836  *  Returns a copy of +str+ with the characters in +from_str+ replaced by the
7837  *  corresponding characters in +to_str+.  If +to_str+ is shorter than
7838  *  +from_str+, it is padded with its last character in order to maintain the
7839  *  correspondence.
7840  *
7841  *     "hello".tr('el', 'ip')      #=> "hippo"
7842  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
7843  *     "hello".tr('aeiou', 'AA*')  #=> "hAll*"
7844  *
7845  *  Both strings may use the <code>c1-c2</code> notation to denote ranges of
7846  *  characters, and +from_str+ may start with a <code>^</code>, which denotes
7847  *  all characters except those listed.
7848  *
7849  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
7850  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
7851  *
7852  *  The backslash character <code>\\</code> can be used to escape
7853  *  <code>^</code> or <code>-</code> and is otherwise ignored unless it
7854  *  appears at the end of a range or the end of the +from_str+ or +to_str+:
7855  *
7856  *     "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7857  *     "hello-world".tr("a\\-eo", "*")   #=> "h*ll**w*rld"
7858  *
7859  *     "hello\r\nworld".tr("\r", "")   #=> "hello\nworld"
7860  *     "hello\r\nworld".tr("\\r", "")  #=> "hello\r\nwold"
7861  *     "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7862  *
7863  *     "X['\\b']".tr("X\\", "")   #=> "['b']"
7864  *     "X['\\b']".tr("X-\\]", "") #=> "'b'"
7865  */
7866
7867 static VALUE
7868 rb_str_tr(VALUE str, VALUE src, VALUE repl)
7869 {
7870     str = str_duplicate(rb_cString, str);
7871     tr_trans(str, src, repl, 0);
7872     return str;
7873 }
7874
7875 #define TR_TABLE_MAX (UCHAR_MAX+1)
7876 #define TR_TABLE_SIZE (TR_TABLE_MAX+1)
7877 static void
7878 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
7879                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
7880 {
7881     const unsigned int errc = -1;
7882     char buf[TR_TABLE_MAX];
7883     struct tr tr;
7884     unsigned int c;
7885     VALUE table = 0, ptable = 0;
7886     int i, l, cflag = 0;
7887
7888     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
7889     tr.gen = tr.now = tr.max = 0;
7890
7891     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
7892         cflag = 1;
7893         tr.p += l;
7894     }
7895     if (first) {
7896         for (i=0; i<TR_TABLE_MAX; i++) {
7897             stable[i] = 1;
7898         }
7899         stable[TR_TABLE_MAX] = cflag;
7900     }
7901     else if (stable[TR_TABLE_MAX] && !cflag) {
7902         stable[TR_TABLE_MAX] = 0;
7903     }
7904     for (i=0; i<TR_TABLE_MAX; i++) {
7905         buf[i] = cflag;
7906     }
7907
7908     while ((c = trnext(&tr, enc)) != errc) {
7909         if (c < TR_TABLE_MAX) {
7910             buf[(unsigned char)c] = !cflag;
7911         }
7912         else {
7913             VALUE key = UINT2NUM(c);
7914
7915             if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
7916                 if (cflag) {
7917                     ptable = *ctablep;
7918                     table = ptable ? ptable : rb_hash_new();
7919                     *ctablep = table;
7920                 }
7921                 else {
7922                     table = rb_hash_new();
7923                     ptable = *tablep;
7924                     *tablep = table;
7925                 }
7926             }
7927             if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
7928                 rb_hash_aset(table, key, Qtrue);
7929             }
7930         }
7931     }
7932     for (i=0; i<TR_TABLE_MAX; i++) {
7933         stable[i] = stable[i] && buf[i];
7934     }
7935     if (!table && !cflag) {
7936         *tablep = 0;
7937     }
7938 }
7939
7940
7941 static int
7942 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7943 {
7944     if (c < TR_TABLE_MAX) {
7945         return table[c] != 0;
7946     }
7947     else {
7948         VALUE v = UINT2NUM(c);
7949
7950         if (del) {
7951             if (!NIL_P(rb_hash_lookup(del, v)) &&
7952                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7953                 return TRUE;
7954             }
7955         }
7956         else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7957             return FALSE;
7958         }
7959         return table[TR_TABLE_MAX] ? TRUE : FALSE;
7960     }
7961 }
7962
7963 /*
7964  *  call-seq:
7965  *     str.delete!([other_str]+)   -> str or nil
7966  *
7967  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7968  *  <code>nil</code> if <i>str</i> was not modified.
7969  */
7970
7971 static VALUE
7972 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7973 {
7974     char squeez[TR_TABLE_SIZE];
7975     rb_encoding *enc = 0;
7976     char *s, *send, *t;
7977     VALUE del = 0, nodel = 0;
7978     int modify = 0;
7979     int i, ascompat, cr;
7980
7981     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7982     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
7983     for (i=0; i<argc; i++) {
7984         VALUE s = argv[i];
7985
7986         StringValue(s);
7987         enc = rb_enc_check(str, s);
7988         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7989     }
7990
7991     str_modify_keep_cr(str);
7992     ascompat = rb_enc_asciicompat(enc);
7993     s = t = RSTRING_PTR(str);
7994     send = RSTRING_END(str);
7995     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
7996     while (s < send) {
7997         unsigned int c;
7998         int clen;
7999
8000         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8001             if (squeez[c]) {
8002                 modify = 1;
8003             }
8004             else {
8005                 if (t != s) *t = c;
8006                 t++;
8007             }
8008             s++;
8009         }
8010         else {
8011             c = rb_enc_codepoint_len(s, send, &clen, enc);
8012
8013             if (tr_find(c, squeez, del, nodel)) {
8014                 modify = 1;
8015             }
8016             else {
8017                 if (t != s) rb_enc_mbcput(c, t, enc);
8018                 t += clen;
8019                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
8020             }
8021             s += clen;
8022         }
8023     }
8024     TERM_FILL(t, TERM_LEN(str));
8025     STR_SET_LEN(str, t - RSTRING_PTR(str));
8026     ENC_CODERANGE_SET(str, cr);
8027
8028     if (modify) return str;
8029     return Qnil;
8030 }
8031
8032
8033 /*
8034  *  call-seq:
8035  *     str.delete([other_str]+)   -> new_str
8036  *
8037  *  Returns a copy of <i>str</i> with all characters in the intersection of its
8038  *  arguments deleted. Uses the same rules for building the set of characters as
8039  *  String#count.
8040  *
8041  *     "hello".delete "l","lo"        #=> "heo"
8042  *     "hello".delete "lo"            #=> "he"
8043  *     "hello".delete "aeiou", "^e"   #=> "hell"
8044  *     "hello".delete "ej-m"          #=> "ho"
8045  */
8046
8047 static VALUE
8048 rb_str_delete(int argc, VALUE *argv, VALUE str)
8049 {
8050     str = str_duplicate(rb_cString, str);
8051     rb_str_delete_bang(argc, argv, str);
8052     return str;
8053 }
8054
8055
8056 /*
8057  *  call-seq:
8058  *     str.squeeze!([other_str]*)   -> str or nil
8059  *
8060  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
8061  *  <code>nil</code> if no changes were made.
8062  */
8063
8064 static VALUE
8065 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8066 {
8067     char squeez[TR_TABLE_SIZE];
8068     rb_encoding *enc = 0;
8069     VALUE del = 0, nodel = 0;
8070     unsigned char *s, *send, *t;
8071     int i, modify = 0;
8072     int ascompat, singlebyte = single_byte_optimizable(str);
8073     unsigned int save;
8074
8075     if (argc == 0) {
8076         enc = STR_ENC_GET(str);
8077     }
8078     else {
8079         for (i=0; i<argc; i++) {
8080             VALUE s = argv[i];
8081
8082             StringValue(s);
8083             enc = rb_enc_check(str, s);
8084             if (singlebyte && !single_byte_optimizable(s))
8085                 singlebyte = 0;
8086             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8087         }
8088     }
8089
8090     str_modify_keep_cr(str);
8091     s = t = (unsigned char *)RSTRING_PTR(str);
8092     if (!s || RSTRING_LEN(str) == 0) return Qnil;
8093     send = (unsigned char *)RSTRING_END(str);
8094     save = -1;
8095     ascompat = rb_enc_asciicompat(enc);
8096
8097     if (singlebyte) {
8098         while (s < send) {
8099             unsigned int c = *s++;
8100             if (c != save || (argc > 0 && !squeez[c])) {
8101                 *t++ = save = c;
8102             }
8103         }
8104     }
8105     else {
8106         while (s < send) {
8107             unsigned int c;
8108             int clen;
8109
8110             if (ascompat && (c = *s) < 0x80) {
8111                 if (c != save || (argc > 0 && !squeez[c])) {
8112                     *t++ = save = c;
8113                 }
8114                 s++;
8115             }
8116             else {
8117                 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8118
8119                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8120                     if (t != s) rb_enc_mbcput(c, t, enc);
8121                     save = c;
8122                     t += clen;
8123                 }
8124                 s += clen;
8125             }
8126         }
8127     }
8128
8129     TERM_FILL((char *)t, TERM_LEN(str));
8130     if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8131         STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8132         modify = 1;
8133     }
8134
8135     if (modify) return str;
8136     return Qnil;
8137 }
8138
8139
8140 /*
8141  *  call-seq:
8142  *     str.squeeze([other_str]*)    -> new_str
8143  *
8144  *  Builds a set of characters from the <i>other_str</i> parameter(s)
8145  *  using the procedure described for String#count. Returns a new
8146  *  string where runs of the same character that occur in this set are
8147  *  replaced by a single character. If no arguments are given, all
8148  *  runs of identical characters are replaced by a single character.
8149  *
8150  *     "yellow moon".squeeze                  #=> "yelow mon"
8151  *     "  now   is  the".squeeze(" ")         #=> " now is the"
8152  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
8153  */
8154
8155 static VALUE
8156 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8157 {
8158     str = str_duplicate(rb_cString, str);
8159     rb_str_squeeze_bang(argc, argv, str);
8160     return str;
8161 }
8162
8163
8164 /*
8165  *  call-seq:
8166  *     str.tr_s!(from_str, to_str)   -> str or nil
8167  *
8168  *  Performs String#tr_s processing on <i>str</i> in place,
8169  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
8170  */
8171
8172 static VALUE
8173 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8174 {
8175     return tr_trans(str, src, repl, 1);
8176 }
8177
8178
8179 /*
8180  *  call-seq:
8181  *     str.tr_s(from_str, to_str)   -> new_str
8182  *
8183  *  Processes a copy of <i>str</i> as described under String#tr, then
8184  *  removes duplicate characters in regions that were affected by the
8185  *  translation.
8186  *
8187  *     "hello".tr_s('l', 'r')     #=> "hero"
8188  *     "hello".tr_s('el', '*')    #=> "h*o"
8189  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
8190  */
8191
8192 static VALUE
8193 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8194 {
8195     str = str_duplicate(rb_cString, str);
8196     tr_trans(str, src, repl, 1);
8197     return str;
8198 }
8199
8200
8201 /*
8202  *  call-seq:
8203  *     str.count([other_str]+)   -> integer
8204  *
8205  *  Each +other_str+ parameter defines a set of characters to count.  The
8206  *  intersection of these sets defines the characters to count in +str+.  Any
8207  *  +other_str+ that starts with a caret <code>^</code> is negated.  The
8208  *  sequence <code>c1-c2</code> means all characters between c1 and c2.  The
8209  *  backslash character <code>\\</code> can be used to escape <code>^</code> or
8210  *  <code>-</code> and is otherwise ignored unless it appears at the end of a
8211  *  sequence or the end of a +other_str+.
8212  *
8213  *     a = "hello world"
8214  *     a.count "lo"                   #=> 5
8215  *     a.count "lo", "o"              #=> 2
8216  *     a.count "hello", "^l"          #=> 4
8217  *     a.count "ej-m"                 #=> 4
8218  *
8219  *     "hello^world".count "\\^aeiou" #=> 4
8220  *     "hello-world".count "a\\-eo"   #=> 4
8221  *
8222  *     c = "hello world\\r\\n"
8223  *     c.count "\\"                   #=> 2
8224  *     c.count "\\A"                  #=> 0
8225  *     c.count "X-\\w"                #=> 3
8226  */
8227
8228 static VALUE
8229 rb_str_count(int argc, VALUE *argv, VALUE str)
8230 {
8231     char table[TR_TABLE_SIZE];
8232     rb_encoding *enc = 0;
8233     VALUE del = 0, nodel = 0, tstr;
8234     char *s, *send;
8235     int i;
8236     int ascompat;
8237     size_t n = 0;
8238
8239     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8240
8241     tstr = argv[0];
8242     StringValue(tstr);
8243     enc = rb_enc_check(str, tstr);
8244     if (argc == 1) {
8245         const char *ptstr;
8246         if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8247             (ptstr = RSTRING_PTR(tstr),
8248              ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8249             !is_broken_string(str)) {
8250             int clen;
8251             unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8252
8253             s = RSTRING_PTR(str);
8254             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8255             send = RSTRING_END(str);
8256             while (s < send) {
8257                 if (*(unsigned char*)s++ == c) n++;
8258             }
8259             return SIZET2NUM(n);
8260         }
8261     }
8262
8263     tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8264     for (i=1; i<argc; i++) {
8265         tstr = argv[i];
8266         StringValue(tstr);
8267         enc = rb_enc_check(str, tstr);
8268         tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8269     }
8270
8271     s = RSTRING_PTR(str);
8272     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8273     send = RSTRING_END(str);
8274     ascompat = rb_enc_asciicompat(enc);
8275     while (s < send) {
8276         unsigned int c;
8277
8278         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8279             if (table[c]) {
8280                 n++;
8281             }
8282             s++;
8283         }
8284         else {
8285             int clen;
8286             c = rb_enc_codepoint_len(s, send, &clen, enc);
8287             if (tr_find(c, table, del, nodel)) {
8288                 n++;
8289             }
8290             s += clen;
8291         }
8292     }
8293
8294     return SIZET2NUM(n);
8295 }
8296
8297 static VALUE
8298 rb_fs_check(VALUE val)
8299 {
8300     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8301         val = rb_check_string_type(val);
8302         if (NIL_P(val)) return 0;
8303     }
8304     return val;
8305 }
8306
/*
 * Byte-indexed whitespace lookup: the entry is 1 for the six bytes
 * 0x09..0x0d (TAB, LF, VT, FF, CR) and 0x20 (space), and 0 for every other
 * byte value.  Unnamed elements are zero-initialized.
 */
static const char isspacetable[256] = {
    [0x09] = 1,  /* TAB */
    [0x0a] = 1,  /* LF  */
    [0x0b] = 1,  /* VT  */
    [0x0c] = 1,  /* FF  */
    [0x0d] = 1,  /* CR  */
    [0x20] = 1   /* space */
};

/* Table lookup; the cast keeps negative char values from indexing out of range. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8327
8328 static long
8329 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8330 {
8331     if (empty_count >= 0 && len == 0) {
8332         return empty_count + 1;
8333     }
8334     if (empty_count > 0) {
8335         /* make different substrings */
8336         if (result) {
8337             do {
8338                 rb_ary_push(result, str_new_empty_String(str));
8339             } while (--empty_count > 0);
8340         }
8341         else {
8342             do {
8343                 rb_yield(str_new_empty_String(str));
8344             } while (--empty_count > 0);
8345         }
8346     }
8347     str = rb_str_subseq(str, beg, len);
8348     if (result) {
8349         rb_ary_push(result, str);
8350     }
8351     else {
8352         rb_yield(str);
8353     }
8354     return empty_count;
8355 }
8356
/* Strategy selected by rb_str_split_m for cutting the receiver. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* literal byte-string separator */
    SPLIT_TYPE_REGEXP,  /* separator located by regexp search */
    SPLIT_TYPE_CHARS    /* empty separator: one field per character */
} split_type_t;
8360
8361 static split_type_t
8362 literal_split_pattern(VALUE spat, split_type_t default_type)
8363 {
8364     rb_encoding *enc = STR_ENC_GET(spat);
8365     const char *ptr;
8366     long len;
8367     RSTRING_GETMEM(spat, ptr, len);
8368     if (len == 0) {
8369         /* Special case - split into chars */
8370         return SPLIT_TYPE_CHARS;
8371     }
8372     else if (rb_enc_asciicompat(enc)) {
8373         if (len == 1 && ptr[0] == ' ') {
8374             return SPLIT_TYPE_AWK;
8375         }
8376     }
8377     else {
8378         int l;
8379         if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8380             return SPLIT_TYPE_AWK;
8381         }
8382     }
8383     return default_type;
8384 }
8385
8386 /*
8387  *  call-seq:
8388  *     str.split(pattern=nil, [limit])                -> an_array
8389  *     str.split(pattern=nil, [limit]) {|sub| block } -> str
8390  *
8391  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
8392  *  of these substrings.
8393  *
8394  *  If <i>pattern</i> is a String, then its contents are used as
8395  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8396  *  space, <i>str</i> is split on whitespace, with leading and trailing
8397  *  whitespace and runs of contiguous whitespace characters ignored.
8398  *
8399  *  If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8400  *  pattern matches. Whenever the pattern matches a zero-length string,
8401  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
8402  *  groups, the respective matches will be returned in the array as well.
8403  *
8404  *  If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8405  *  If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8406  *  split on whitespace as if ' ' were specified.
8407  *
8408  *  If the <i>limit</i> parameter is omitted, trailing null fields are
8409  *  suppressed. If <i>limit</i> is a positive number, at most that number
8410  *  of split substrings will be returned (captured groups will be returned
8411  *  as well, but are not counted towards the limit).
8412  *  If <i>limit</i> is <code>1</code>, the entire
8413  *  string is returned as the only entry in an array. If negative, there is no
8414  *  limit to the number of fields returned, and trailing null fields are not
8415  *  suppressed.
8416  *
8417  *  When the input +str+ is empty an empty Array is returned as the string is
8418  *  considered to have no fields to split.
8419  *
8420  *     " now's  the time ".split       #=> ["now's", "the", "time"]
8421  *     " now's  the time ".split(' ')  #=> ["now's", "the", "time"]
8422  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
8423  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8424  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
8425  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
8426  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
8427  *
8428  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
8429  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
8430  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
8431  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
8432  *
8433  *     "1:2:3".split(/(:)()()/, 2)     #=> ["1", ":", "", "", "2:3"]
8434  *
8435  *     "".split(',', -1)               #=> []
8436  *
8437  *  If a block is given, invoke the block with each split substring.
8438  *
8439  */
8440
8441 static VALUE
8442 rb_str_split_m(int argc, VALUE *argv, VALUE str)
8443 {
8444     rb_encoding *enc;
8445     VALUE spat;
8446     VALUE limit;
8447     split_type_t split_type;
         /* beg/end: byte offsets of the field being built; i: fields counted
          * against lim; empty_count: see split_string (-1 = empties are never
          * deferred). */
8448     long beg, end, i = 0, empty_count = -1;
8449     int lim = 0;
8450     VALUE result, tmp;
8451
         /* result doubles as a mode flag and is tested as a C boolean below:
          * Qfalse selects "yield each field to the block", while Qnil is only a
          * placeholder that is replaced by a real array before splitting. */
8452     result = rb_block_given_p() ? Qfalse : Qnil;
8453     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8454         lim = NUM2INT(limit);
             /* zero or negative limit: no upper bound on the field count */
8455         if (lim <= 0) limit = Qnil;
8456         else if (lim == 1) {
                 /* limit 1: the whole string is the only field (none at all
                  * for an empty receiver); no pattern is examined */
8457             if (RSTRING_LEN(str) == 0)
8458                 return result ? rb_ary_new2(0) : str;
8459             tmp = str_duplicate(rb_cString, str);
8460             if (!result) {
8461                 rb_yield(tmp);
8462                 return str;
8463             }
8464             return rb_ary_new3(1, tmp);
8465         }
8466         i = 1;
8467     }
         /* Only when the limit is absent or 0 are empty fields deferred
          * (empty_count >= 0), which lets trailing ones be dropped.  A negative
          * limit leaves empty_count at -1 so every empty field is delivered. */
8468     if (NIL_P(limit) && !lim) empty_count = 0;
8469
8470     enc = STR_ENC_GET(str);
8471     split_type = SPLIT_TYPE_REGEXP;
         /* Resolve the separator: an explicit pattern goes through
          * get_pat_quoted; otherwise rb_fs ($;) is consulted -- nil selects
          * awk-style whitespace splitting, a value that is neither String nor
          * Regexp (nor convertible) raises, and any other non-nil value is
          * accepted with a deprecation warning. */
8472     if (!NIL_P(spat)) {
8473         spat = get_pat_quoted(spat, 0);
8474     }
8475     else if (NIL_P(spat = rb_fs)) {
8476         split_type = SPLIT_TYPE_AWK;
8477     }
8478     else if (!(spat = rb_fs_check(spat))) {
8479         rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8480     }
8481     else {
8482         rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8483     }
8484     if (split_type != SPLIT_TYPE_AWK) {
8485         switch (BUILTIN_TYPE(spat)) {
8486           case T_REGEXP:
8487             rb_reg_options(spat); /* check if uninitialized */
8488             tmp = RREGEXP_SRC(spat);
                 /* The regexp source is classified like a literal: an empty
                  * source becomes SPLIT_TYPE_CHARS, and a source that is just
                  * one space is handled as a plain string split on that
                  * source (not as awk mode). */
8489             split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8490             if (split_type == SPLIT_TYPE_AWK) {
8491                 spat = tmp;
8492                 split_type = SPLIT_TYPE_STRING;
8493             }
8494             break;
8495
8496           case T_STRING:
8497             mustnot_broken(spat);
8498             split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8499             break;
8500
8501           default:
8502             UNREACHABLE_RETURN(Qnil);
8503         }
8504     }
8505
     /* Deliver one field and keep the deferred-empty counter up to date. */
8506 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8507
8508     if (result) result = rb_ary_new();
8509     beg = 0;
8510     char *ptr = RSTRING_PTR(str);
8511     char *eptr = RSTRING_END(str);
8512     if (split_type == SPLIT_TYPE_AWK) {
             /* skip is non-zero while whitespace between fields is being
              * consumed; beg and end are offsets relative to bptr. */
8513         char *bptr = ptr;
8514         int skip = 1;
8515         unsigned int c;
8516
8517         end = beg;
8518         if (is_ascii_string(str)) {
                 /* byte-at-a-time scan using the isspacetable lookup */
8519             while (ptr < eptr) {
8520                 c = (unsigned char)*ptr++;
8521                 if (skip) {
8522                     if (ascii_isspace(c)) {
8523                         beg = ptr - bptr;
8524                     }
8525                     else {
8526                         end = ptr - bptr;
8527                         skip = 0;
                             /* limit reached: stop here so the rest of the
                              * string becomes the final field */
8528                         if (!NIL_P(limit) && lim <= i) break;
8529                     }
8530                 }
8531                 else if (ascii_isspace(c)) {
8532                     SPLIT_STR(beg, end-beg);
8533                     skip = 1;
8534                     beg = ptr - bptr;
8535                     if (!NIL_P(limit)) ++i;
8536                 }
8537                 else {
8538                     end = ptr - bptr;
8539                 }
8540             }
8541         }
8542         else {
                 /* same state machine, but decoding one code point per step
                  * and classifying it with rb_isspace */
8543             while (ptr < eptr) {
8544                 int n;
8545
8546                 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8547                 ptr += n;
8548                 if (skip) {
8549                     if (rb_isspace(c)) {
8550                         beg = ptr - bptr;
8551                     }
8552                     else {
8553                         end = ptr - bptr;
8554                         skip = 0;
8555                         if (!NIL_P(limit) && lim <= i) break;
8556                     }
8557                 }
8558                 else if (rb_isspace(c)) {
8559                     SPLIT_STR(beg, end-beg);
8560                     skip = 1;
8561                     beg = ptr - bptr;
8562                     if (!NIL_P(limit)) ++i;
8563                 }
8564                 else {
8565                     end = ptr - bptr;
8566                 }
8567             }
8568         }
8569     }
8570     else if (split_type == SPLIT_TYPE_STRING) {
8571         char *str_start = ptr;
8572         char *substr_start = ptr;
8573         char *sptr = RSTRING_PTR(spat);
8574         long slen = RSTRING_LEN(spat);
8575
8576         mustnot_broken(str);
8577         enc = rb_enc_check(str, spat);
             /* end receives the offset of the next separator hit relative to
              * ptr (negative when there is none). */
8578         while (ptr < eptr &&
8579                (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8580             /* Check we are at the start of a char */
8581             char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8582             if (t != ptr + end) {
                     /* hit is not at a character head: resume the search at
                      * the position rb_enc_right_char_head returned;
                      * substr_start is left alone so the field keeps growing */
8583                 ptr = t;
8584                 continue;
8585             }
8586             SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8587             ptr += end + slen;
8588             substr_start = ptr;
8589             if (!NIL_P(limit) && lim <= ++i) break;
8590         }
8591         beg = ptr - str_start;
8592     }
8593     else if (split_type == SPLIT_TYPE_CHARS) {
8594         char *str_start = ptr;
8595         int n;
8596
8597         mustnot_broken(str);
8598         enc = rb_enc_get(str);
             /* one field per character, stepping by the length reported by
              * rb_enc_precise_mbclen; stops if that length is not positive */
8599         while (ptr < eptr &&
8600                (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8601             SPLIT_STR(ptr - str_start, n);
8602             ptr += n;
8603             if (!NIL_P(limit) && lim <= ++i) break;
8604         }
8605         beg = ptr - str_start;
8606     }
8607     else {
             /* Regexp separator.  start is where the next search begins, beg
              * where the pending field begins; last_null records that the
              * previous search produced an empty match at its start. */
8608         long len = RSTRING_LEN(str);
8609         long start = beg;
8610         long idx;
8611         int last_null = 0;
8612         struct re_registers *regs;
8613         VALUE match = 0;
8614
             /* The increment clause runs after every pass (including
              * `continue`): it un-busies the match and stores it back as the
              * backref. */
8615         for (; rb_reg_search(spat, str, start, 0) >= 0;
8616              (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8617             match = rb_backref_get();
                 /* block mode: mark the match busy while fields are yielded */
8618             if (!result) rb_match_busy(match);
8619             regs = RMATCH_REGS(match);
8620             end = BEG(0);
8621             if (start == end && BEG(0) == END(0)) {
                     /* empty match exactly at the search position */
8622                 if (!ptr) {
8623                     SPLIT_STR(0, 0);
8624                     break;
8625                 }
8626                 else if (last_null == 1) {
                         /* second empty match in a row: emit a single
                          * character as the field and move beg up to start */
8627                     SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8628                     beg = start;
8629                 }
8630                 else {
                         /* first empty match: advance the search position by
                          * one character (or past the end) and search again */
8631                     if (start == len)
8632                         start++;
8633                     else
8634                         start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8635                     last_null = 1;
8636                     continue;
8637                 }
8638             }
8639             else {
8640                 SPLIT_STR(beg, end-beg);
8641                 beg = start = END(0);
8642             }
8643             last_null = 0;
8644
                 /* every capture group that took part in the match
                  * (BEG != -1) is delivered as an additional field */
8645             for (idx=1; idx < regs->num_regs; idx++) {
8646                 if (BEG(idx) == -1) continue;
8647                 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8648             }
8649             if (!NIL_P(limit) && lim <= ++i) break;
8650         }
             /* leaving via break or a failed search skips the increment
              * clause, so the busy mark is released here */
8651         if (match) rb_match_unbusy(match);
8652     }
         /* Deliver what follows the last separator.  For a non-empty receiver
          * this happens when a limit of 2 or more was given, when bytes remain
          * after beg, or when the limit was negative. */
8653     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8654         SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8655     }
8656
         /* array mode returns the array, block mode returns the receiver */
8657     return result ? result : str;
8658 }
8659
8660 VALUE
8661 rb_str_split(VALUE str, const char *sep0)
8662 {
8663     VALUE sep;
8664
8665     StringValue(str);
8666     sep = rb_str_new_cstr(sep0);
8667     return rb_str_split_m(1, &sep, str);
8668 }
8669
/* Evaluates to 0 when a block is given, otherwise to a new array with
 * capacity +size+.  The +m+ argument is not used by the expansion. */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
8671
8672 static inline int
8673 enumerator_element(VALUE ary, VALUE e)
8674 {
8675     if (ary) {
8676         rb_ary_push(ary, e);
8677         return 0;
8678     }
8679     else {
8680         rb_yield(e);
8681         return 1;
8682     }
8683 }
8684
8685 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8686
8687 static const char *
8688 chomp_newline(const char *p, const char *e, rb_encoding *enc)
8689 {
8690     const char *prev = rb_enc_prev_char(p, e, e, enc);
8691     if (rb_enc_is_newline(prev, e, enc)) {
8692         e = prev;
8693         prev = rb_enc_prev_char(p, e, e, enc);
8694         if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8695             e = prev;
8696     }
8697     return e;
8698 }
8699
8700 static VALUE
8701 get_rs(void)
8702 {
8703     VALUE rs = rb_rs;
8704     if (!NIL_P(rs) &&
8705         (!RB_TYPE_P(rs, T_STRING) ||
8706          RSTRING_LEN(rs) != 1 ||
8707          RSTRING_PTR(rs)[0] != '\n')) {
8708         rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8709     }
8710     return rs;
8711 }
8712
8713 #define rb_rs get_rs()
8714
8715 static VALUE
8716 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8717 {
         /*
          * Shared worker for each_line and lines.  Cuts str at every record
          * separator and hands each line to ENUM_ELEM, which pushes it onto
          * ary when ary is non-zero and yields it otherwise.  Returns ary in
          * the first case and the original receiver in the second.
          */
8718     rb_encoding *enc;
8719     VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8720     const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8721     long pos, len, rslen;
8722     int rsnewline = 0;
8723
         /* rb_rs is the get_rs() macro at this point, so falling back to the
          * default separator goes through get_rs's deprecation check. */
8724     if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8725         rs = rb_rs;
8726     if (!NIL_P(opts)) {
8727         static ID keywords[1];
8728         if (!keywords[0]) {
8729             keywords[0] = rb_intern_const("chomp");
8730         }
8731         rb_get_kwargs(opts, keywords, 0, 1, &chomp);
             /* collapse to a plain 0/1 flag; Qundef is treated as "false" */
8732         chomp = (chomp != Qundef && RTEST(chomp));
8733     }
8734
         /* nil separator: the whole string is delivered as one line */
8735     if (NIL_P(rs)) {
8736         if (!ENUM_ELEM(ary, str)) {
8737             return ary;
8738         }
8739         else {
8740             return orig;
8741         }
8742     }
8743
8744     if (!RSTRING_LEN(str)) goto end;
         /* From here on str is the result of rb_str_new_frozen; the caller's
          * object stays in orig for the return value.  ptr and len are
          * remembered for the str_mod_check calls made after each yield. */
8745     str = rb_str_new_frozen(str);
8746     ptr = subptr = RSTRING_PTR(str);
8747     pend = RSTRING_END(str);
8748     len = RSTRING_LEN(str);
8749     StringValue(rs);
8750     rslen = RSTRING_LEN(rs);
8751
         /* the default separator is exempt from the encoding compatibility
          * check */
8752     if (rs == rb_default_rs)
8753         enc = rb_enc_get(str);
8754     else
8755         enc = rb_enc_check(str, rs);
8756
8757     if (rslen == 0) {
8758         /* paragraph mode */
             /* rslen is reused as scratch in this branch: inside the inner
              * loop it is the byte length of the current character (with a
              * leading '\r' included), and it is reset to 0 at the end of
              * every pass that does not break out. */
8759         int n;
8760         const char *eol = NULL;
8761         subend = subptr;
8762         while (subend < pend) {
8763             do {
                     /* n = length of a leading '\r', or 0 if there is none */
8764                 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
8765                     n = 0;
8766                 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
8767                 if (rb_enc_is_newline(subend + n, pend, enc)) {
                         /* a newline starting exactly where the previous one
                          * ended closes the paragraph; rslen keeps this
                          * newline's length for the subseq call below */
8768                     if (eol == subend) break;
8769                     subend += rslen;
                         /* newlines seen before any paragraph text
                          * (subptr == NULL) are skipped without arming eol */
8770                     if (subptr) eol = subend;
8771                 }
8772                 else {
8773                     if (!subptr) subptr = subend;
8774                     subend += rslen;
8775                 }
8776                 rslen = 0;
8777             } while (subend < pend);
                 /* only newlines were left: nothing more to deliver */
8778             if (!subptr) break;
                 /* the terminating second newline (rslen bytes) is part of
                  * the line unless chomp was requested */
8779             line = rb_str_subseq(str, subptr - ptr,
8780                                  subend - subptr + (chomp ? 0 : rslen));
8781             if (ENUM_ELEM(ary, line)) {
                     /* the line was yielded: verify str against ptr/len */
8782                 str_mod_check(str, ptr, len);
8783             }
8784             subptr = eol = NULL;
8785         }
8786         goto end;
8787     }
8788     else {
8789         rsptr = RSTRING_PTR(rs);
             /* rsnewline: the separator is exactly one minimum-width
              * character that the encoding classifies as a newline; chomp
              * then goes through chomp_newline instead of cutting rslen
              * bytes. */
8790         if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
8791             rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
8792             rsnewline = 1;
8793         }
8794     }
8795
         /* The default separator is transcoded into str's encoding when that
          * encoding is not ASCII-compatible, and the searched bytes (rsptr,
          * rslen) are replaced by the transcoded form. */
8796     if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
8797         rs = rb_str_new(rsptr, rslen);
8798         rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
8799         rsptr = RSTRING_PTR(rs);
8800         rslen = RSTRING_LEN(rs);
8801     }
8802
8803     while (subptr < pend) {
8804         pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
8805         if (pos < 0) break;
8806         hit = subptr + pos;
             /* a hit that does not sit on a character head is rejected and
              * the search resumes from the adjusted position */
8807         adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
8808         if (hit != adjusted) {
8809             subptr = adjusted;
8810             continue;
8811         }
             /* hit now points just past the separator; subend is the end of
              * the delivered line and may be pulled back by chomp */
8812         subend = hit += rslen;
8813         if (chomp) {
8814             if (rsnewline) {
8815                 subend = chomp_newline(subptr, subend, enc);
8816             }
8817             else {
8818                 subend -= rslen;
8819             }
8820         }
8821         line = rb_str_subseq(str, subptr - ptr, subend - subptr);
8822         if (ENUM_ELEM(ary, line)) {
8823             str_mod_check(str, ptr, len);
8824         }
8825         subptr = hit;
8826     }
8827
         /* deliver the text remaining after the last separator hit, if any */
8828     if (subptr != pend) {
8829         if (chomp) {
8830             if (rsnewline) {
8831                 pend = chomp_newline(subptr, pend, enc);
8832             }
                 /* strip a separator that ends the remaining text */
8833             else if (pend - subptr >= rslen &&
8834                      memcmp(pend - rslen, rsptr, rslen) == 0) {
8835                 pend -= rslen;
8836             }
8837         }
8838         line = rb_str_subseq(str, subptr - ptr, pend - subptr);
8839         ENUM_ELEM(ary, line);
8840         RB_GC_GUARD(str);
8841     }
8842
8843   end:
8844     if (ary)
8845         return ary;
8846     else
8847         return orig;
8848 }
8849
8850 /*
8851  *  call-seq:
8852  *     str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8853  *     str.each_line(separator=$/, chomp: false)                   -> an_enumerator
8854  *
8855  *  Splits <i>str</i> using the supplied parameter as the record
8856  *  separator (<code>$/</code> by default), passing each substring in
8857  *  turn to the supplied block.  If a zero-length record separator is
8858  *  supplied, the string is split into paragraphs delimited by
8859  *  multiple successive newlines.
8860  *
8861  *  If +chomp+ is +true+, +separator+ will be removed from the end of each
8862  *  line.
8863  *
8864  *  If no block is given, an enumerator is returned instead.
8865  *
8866  *     "hello\nworld".each_line {|s| p s}
8867  *     # prints:
8868  *     #   "hello\n"
8869  *     #   "world"
8870  *
8871  *     "hello\nworld".each_line('l') {|s| p s}
8872  *     # prints:
8873  *     #   "hel"
8874  *     #   "l"
8875  *     #   "o\nworl"
8876  *     #   "d"
8877  *
8878  *     "hello\n\n\nworld".each_line('') {|s| p s}
8879  *     # prints
8880  *     #   "hello\n\n"
8881  *     #   "world"
8882  *
8883  *     "hello\nworld".each_line(chomp: true) {|s| p s}
8884  *     # prints:
8885  *     #   "hello"
8886  *     #   "world"
8887  *
8888  *     "hello\nworld".each_line('l', chomp: true) {|s| p s}
8889  *     # prints:
8890  *     #   "he"
8891  *     #   ""
8892  *     #   "o\nwor"
8893  *     #   "d"
8894  *
8895  */
8896
8897 static VALUE
8898 rb_str_each_line(int argc, VALUE *argv, VALUE str)
8899 {
8900     RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
8901     return rb_str_enumerate_lines(argc, argv, str, 0);
8902 }
8903
8904 /*
8905  *  call-seq:
8906  *     str.lines(separator=$/, chomp: false)  -> an_array
8907  *
 *  Returns an array of lines in <i>str</i> split using the supplied
 *  record separator (<code>$/</code> by default).  This is a
 *  shorthand for <code>str.each_line(separator, chomp: chomp).to_a</code>.
8911  *
8912  *  If +chomp+ is +true+, +separator+ will be removed from the end of each
8913  *  line.
8914  *
8915  *     "hello\nworld\n".lines              #=> ["hello\n", "world\n"]
8916  *     "hello  world".lines(' ')           #=> ["hello ", " ", "world"]
8917  *     "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8918  *
8919  *  If a block is given, which is a deprecated form, works the same as
8920  *  <code>each_line</code>.
8921  */
8922
8923 static VALUE
8924 rb_str_lines(int argc, VALUE *argv, VALUE str)
8925 {
8926     VALUE ary = WANTARRAY("lines", 0);
8927     return rb_str_enumerate_lines(argc, argv, str, ary);
8928 }
8929
8930 static VALUE
8931 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8932 {
8933     return LONG2FIX(RSTRING_LEN(str));
8934 }
8935
8936 static VALUE
8937 rb_str_enumerate_bytes(VALUE str, VALUE ary)
8938 {
8939     long i;
8940
8941     for (i=0; i<RSTRING_LEN(str); i++) {
8942         ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
8943     }
8944     if (ary)
8945         return ary;
8946     else
8947         return str;
8948 }
8949
8950 /*
8951  *  call-seq:
8952  *     str.each_byte {|integer| block }    -> str
8953  *     str.each_byte                      -> an_enumerator
8954  *
8955  *  Passes each byte in <i>str</i> to the given block, or returns an
8956  *  enumerator if no block is given.
8957  *
8958  *     "hello".each_byte {|c| print c, ' ' }
8959  *
8960  *  <em>produces:</em>
8961  *
8962  *     104 101 108 108 111
8963  */
8964
static VALUE
rb_str_each_byte(VALUE str)
{
    /* No block given: return an enumerator whose size is computed by
     * rb_str_each_byte_size (the byte length of the string). */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
    /* Block given: a 0 array argument makes rb_str_enumerate_bytes return
     * str itself instead of an array. */
    return rb_str_enumerate_bytes(str, 0);
}
8971
8972 /*
8973  *  call-seq:
8974  *     str.bytes    -> an_array
8975  *
8976  *  Returns an array of bytes in <i>str</i>.  This is a shorthand for
8977  *  <code>str.each_byte.to_a</code>.
8978  *
8979  *  If a block is given, which is a deprecated form, works the same as
8980  *  <code>each_byte</code>.
8981  */
8982
8983 static VALUE
8984 rb_str_bytes(VALUE str)
8985 {
8986     VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
8987     return rb_str_enumerate_bytes(str, ary);
8988 }
8989
/*
 * Size callback for the enumerators returned by String#each_char and
 * String#each_codepoint: delegates to rb_str_length(str).
 * +args+ and +eobj+ are unused.
 */
static VALUE
rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
{
    return rb_str_length(str);
}
8995
8996 static VALUE
8997 rb_str_enumerate_chars(VALUE str, VALUE ary)
8998 {
8999     VALUE orig = str;
9000     long i, len, n;
9001     const char *ptr;
9002     rb_encoding *enc;
9003
9004     str = rb_str_new_frozen(str);
9005     ptr = RSTRING_PTR(str);
9006     len = RSTRING_LEN(str);
9007     enc = rb_enc_get(str);
9008
9009     if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) {
9010         for (i = 0; i < len; i += n) {
9011             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9012             ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9013         }
9014     }
9015     else {
9016         for (i = 0; i < len; i += n) {
9017             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9018             ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9019         }
9020     }
9021     RB_GC_GUARD(str);
9022     if (ary)
9023         return ary;
9024     else
9025         return orig;
9026 }
9027
9028 /*
9029  *  call-seq:
9030  *     str.each_char {|cstr| block }    -> str
9031  *     str.each_char                    -> an_enumerator
9032  *
9033  *  Passes each character in <i>str</i> to the given block, or returns
9034  *  an enumerator if no block is given.
9035  *
9036  *     "hello".each_char {|c| print c, ' ' }
9037  *
9038  *  <em>produces:</em>
9039  *
9040  *     h e l l o
9041  */
9042
static VALUE
rb_str_each_char(VALUE str)
{
    /* No block given: return an enumerator sized by
     * rb_str_each_char_size. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
    /* Block given: a 0 array argument makes rb_str_enumerate_chars return
     * the receiver instead of an array. */
    return rb_str_enumerate_chars(str, 0);
}
9049
9050 /*
9051  *  call-seq:
9052  *     str.chars    -> an_array
9053  *
9054  *  Returns an array of characters in <i>str</i>.  This is a shorthand
9055  *  for <code>str.each_char.to_a</code>.
9056  *
9057  *  If a block is given, which is a deprecated form, works the same as
9058  *  <code>each_char</code>.
9059  */
9060
9061 static VALUE
9062 rb_str_chars(VALUE str)
9063 {
9064     VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9065     return rb_str_enumerate_chars(str, ary);
9066 }
9067
9068 static VALUE
9069 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9070 {
9071     VALUE orig = str;
9072     int n;
9073     unsigned int c;
9074     const char *ptr, *end;
9075     rb_encoding *enc;
9076
9077     if (single_byte_optimizable(str))
9078         return rb_str_enumerate_bytes(str, ary);
9079
9080     str = rb_str_new_frozen(str);
9081     ptr = RSTRING_PTR(str);
9082     end = RSTRING_END(str);
9083     enc = STR_ENC_GET(str);
9084
9085     while (ptr < end) {
9086         c = rb_enc_codepoint_len(ptr, end, &n, enc);
9087         ENUM_ELEM(ary, UINT2NUM(c));
9088         ptr += n;
9089     }
9090     RB_GC_GUARD(str);
9091     if (ary)
9092         return ary;
9093     else
9094         return orig;
9095 }
9096
9097 /*
9098  *  call-seq:
9099  *     str.each_codepoint {|integer| block }    -> str
9100  *     str.each_codepoint                       -> an_enumerator
9101  *
 *  Passes the Integer ordinal of each character in <i>str</i> (also known
 *  as a <i>codepoint</i> when applied to Unicode strings) to the given
 *  block.  For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
 *  values are directly derived from the binary representation
 *  of each character.
9107  *
9108  *  If no block is given, an enumerator is returned instead.
9109  *
9110  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
9111  *
9112  *  <em>produces:</em>
9113  *
9114  *     104 101 108 108 111 1593
9115  */
9116
static VALUE
rb_str_each_codepoint(VALUE str)
{
    /* No block given: return an enumerator.  The size callback is the one
     * used for each_char (rb_str_each_char_size). */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
    /* Block given: a 0 array argument makes the helper return the receiver
     * instead of an array. */
    return rb_str_enumerate_codepoints(str, 0);
}
9123
9124 /*
9125  *  call-seq:
9126  *     str.codepoints   -> an_array
9127  *
9128  *  Returns an array of the Integer ordinals of the
9129  *  characters in <i>str</i>.  This is a shorthand for
9130  *  <code>str.each_codepoint.to_a</code>.
9131  *
9132  *  If a block is given, which is a deprecated form, works the same as
9133  *  <code>each_codepoint</code>.
9134  */
9135
9136 static VALUE
9137 rb_str_codepoints(VALUE str)
9138 {
9139     VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9140     return rb_str_enumerate_codepoints(str, ary);
9141 }
9142
/*
 * Returns a regex compiled from the pattern `\X` for the encoding +enc+.
 *
 * Only the regex for UTF-8 is cached, in a function-local static that is
 * reused on later calls.  For every other encoding a new regex is compiled
 * on each call and this function keeps no reference to it.
 *
 * A compilation failure is reported through rb_fatal().
 */
static regex_t *
get_reg_grapheme_cluster(rb_encoding *enc)
{
    int encidx = rb_enc_to_index(enc);
    regex_t *reg_grapheme_cluster = NULL;
    static regex_t *reg_grapheme_cluster_utf8 = NULL;

    /* synchronize */
    /* NOTE(review): no lock is taken around the static cache in the code
     * visible here -- confirm whether callers serialize access. */
    if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
        reg_grapheme_cluster = reg_grapheme_cluster_utf8;
    }
    if (!reg_grapheme_cluster) {
        /* Default pattern source: the two single-byte characters `\` `X`
         * (sizeof - 1 drops the terminating NUL). */
        const OnigUChar source_ascii[] = "\\X";
        OnigErrorInfo einfo;
        const OnigUChar *source = source_ascii;
        size_t source_len = sizeof(source_ascii) - 1;
        /* For UTF-16/UTF-32 the same two characters are spelled out as
         * 2- or 4-byte units in the matching byte order. */
        switch (encidx) {
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
#define CASE_UTF(e) \
          case ENCINDEX_UTF_##e: { \
            static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
            source = source_UTF_##e; \
            source_len = sizeof(source_UTF_##e); \
            break; \
          }
            CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
#undef CASE_UTF
#undef CHARS_16BE
#undef CHARS_16LE
#undef CHARS_32BE
#undef CHARS_32LE
        }
        int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
                         ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
        if (r) {
            UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
            onig_error_code_to_str(message, r, &einfo);
            rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
        }
        /* Remember the regex only for UTF-8; other encodings are not
         * cached. */
        if (encidx == rb_utf8_encindex()) {
            reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
        }
    }
    return reg_grapheme_cluster;
}
9191
9192 static VALUE
9193 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9194 {
9195     size_t grapheme_cluster_count = 0;
9196     regex_t *reg_grapheme_cluster = NULL;
9197     rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9198     const char *ptr, *end;
9199
9200     if (!rb_enc_unicode_p(enc)) {
9201         return rb_str_length(str);
9202     }
9203
9204     reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9205     ptr = RSTRING_PTR(str);
9206     end = RSTRING_END(str);
9207
9208     while (ptr < end) {
9209         OnigPosition len = onig_match(reg_grapheme_cluster,
9210                                       (const OnigUChar *)ptr, (const OnigUChar *)end,
9211                                       (const OnigUChar *)ptr, NULL, 0);
9212         if (len <= 0) break;
9213         grapheme_cluster_count++;
9214         ptr += len;
9215     }
9216
9217     return SIZET2NUM(grapheme_cluster_count);
9218 }
9219
9220 static VALUE
9221 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9222 {
9223     VALUE orig = str;
9224     regex_t *reg_grapheme_cluster = NULL;
9225     rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9226     const char *ptr0, *ptr, *end;
9227
9228     if (!rb_enc_unicode_p(enc)) {
9229         return rb_str_enumerate_chars(str, ary);
9230     }
9231
9232     if (!ary) str = rb_str_new_frozen(str);
9233     reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9234     ptr0 = ptr = RSTRING_PTR(str);
9235     end = RSTRING_END(str);
9236
9237     while (ptr < end) {
9238         OnigPosition len = onig_match(reg_grapheme_cluster,
9239                                       (const OnigUChar *)ptr, (const OnigUChar *)end,
9240                                       (const OnigUChar *)ptr, NULL, 0);
9241         if (len <= 0) break;
9242         ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9243         ptr += len;
9244     }
9245     RB_GC_GUARD(str);
9246     if (ary)
9247         return ary;
9248     else
9249         return orig;
9250 }
9251
9252 /*
9253  *  call-seq:
9254  *     str.each_grapheme_cluster {|cstr| block }    -> str
9255  *     str.each_grapheme_cluster                    -> an_enumerator
9256  *
9257  *  Passes each grapheme cluster in <i>str</i> to the given block, or returns
9258  *  an enumerator if no block is given.
9259  *  Unlike String#each_char, this enumerates by grapheme clusters defined by
9260  *  Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9261  *
9262  *     "a\u0300".each_char.to_a.size #=> 2
9263  *     "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9264  *
9265  */
9266
static VALUE
rb_str_each_grapheme_cluster(VALUE str)
{
    /* No block given: return an enumerator sized by
     * rb_str_each_grapheme_cluster_size. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
    /* Block given: a 0 array argument makes the helper return the receiver
     * instead of an array. */
    return rb_str_enumerate_grapheme_clusters(str, 0);
}
9273
9274 /*
9275  *  call-seq:
9276  *     str.grapheme_clusters   -> an_array
9277  *
9278  *  Returns an array of grapheme clusters in <i>str</i>.  This is a shorthand
9279  *  for <code>str.each_grapheme_cluster.to_a</code>.
9280  *
9281  *  If a block is given, which is a deprecated form, works the same as
9282  *  <code>each_grapheme_cluster</code>.
9283  */
9284
9285 static VALUE
9286 rb_str_grapheme_clusters(VALUE str)
9287 {
9288     VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9289     return rb_str_enumerate_grapheme_clusters(str, ary);
9290 }
9291
9292 static long
9293 chopped_length(VALUE str)
9294 {
9295     rb_encoding *enc = STR_ENC_GET(str);
9296     const char *p, *p2, *beg, *end;
9297
9298     beg = RSTRING_PTR(str);
9299     end = beg + RSTRING_LEN(str);
9300     if (beg >= end) return 0;
9301     p = rb_enc_prev_char(beg, end, end, enc);
9302     if (!p) return 0;
9303     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9304         p2 = rb_enc_prev_char(beg, p, end, enc);
9305         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9306     }
9307     return p - beg;
9308 }
9309
9310 /*
9311  *  call-seq:
9312  *     str.chop!   -> str or nil
9313  *
9314  *  Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9315  *  <code>nil</code> if <i>str</i> is the empty string.  See also
9316  *  String#chomp!.
9317  */
9318
9319 static VALUE
9320 rb_str_chop_bang(VALUE str)
9321 {
9322     str_modify_keep_cr(str);
9323     if (RSTRING_LEN(str) > 0) {
9324         long len;
9325         len = chopped_length(str);
9326         STR_SET_LEN(str, len);
9327         TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9328         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9329             ENC_CODERANGE_CLEAR(str);
9330         }
9331         return str;
9332     }
9333     return Qnil;
9334 }
9335
9336
9337 /*
9338  *  call-seq:
9339  *     str.chop   -> new_str
9340  *
9341  *  Returns a new String with the last character removed.  If the
9342  *  string ends with <code>\r\n</code>, both characters are
9343  *  removed. Applying <code>chop</code> to an empty string returns an
9344  *  empty string. String#chomp is often a safer alternative, as it
9345  *  leaves the string unchanged if it doesn't end in a record
9346  *  separator.
9347  *
9348  *     "string\r\n".chop   #=> "string"
9349  *     "string\n\r".chop   #=> "string\n"
9350  *     "string\n".chop     #=> "string"
9351  *     "string".chop       #=> "strin"
9352  *     "x".chop.chop       #=> ""
9353  */
9354
9355 static VALUE
9356 rb_str_chop(VALUE str)
9357 {
9358     return rb_str_subseq(str, 0, chopped_length(str));
9359 }
9360
9361 static long
9362 smart_chomp(VALUE str, const char *e, const char *p)
9363 {
9364     rb_encoding *enc = rb_enc_get(str);
9365     if (rb_enc_mbminlen(enc) > 1) {
9366         const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9367         if (rb_enc_is_newline(pp, e, enc)) {
9368             e = pp;
9369         }
9370         pp = e - rb_enc_mbminlen(enc);
9371         if (pp >= p) {
9372             pp = rb_enc_left_char_head(p, pp, e, enc);
9373             if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9374                 e = pp;
9375             }
9376         }
9377     }
9378     else {
9379         switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9380           case '\n':
9381             if (--e > p && *(e-1) == '\r') {
9382                 --e;
9383             }
9384             break;
9385           case '\r':
9386             --e;
9387             break;
9388         }
9389     }
9390     return e - p;
9391 }
9392
/*
 * Returns the byte length +str+ would have after removing the record
 * separator +rs+ from its end.  +str+ is not modified; when nothing is to
 * be removed the current length is returned.
 *
 * +rs+ must be a String here (it is read with RSTRING_GETMEM).
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    /* The default separator object gets the "smart" treatment. */
    if (rs == rb_default_rs) {
        return smart_chomp(str, e, p);
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    /* Empty separator: strip every trailing newline, each together with a
     * "\r" that directly precedes it. */
    if (rslen == 0) {
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    /* A separator longer than the string cannot match. */
    if (rslen > len) return len;

    /* From here on enc is the separator's encoding. */
    enc = rb_enc_get(rs);
    newline = rsptr[rslen-1];
    /* A separator that is exactly one minimum-width character and is a
     * newline is handled like the default separator. */
    if (rslen == rb_enc_mbminlen(enc)) {
        if (rslen == 1) {
            if (newline == '\n')
                return smart_chomp(str, e, p);
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                return smart_chomp(str, e, p);
        }
    }

    /* NOTE(review): rb_enc_check presumably raises when the two encodings
     * are incompatible -- confirm. */
    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return len;
    }
    pp = e - rslen;
    /* Compare the last byte first as a cheap filter, then the whole
     * separator (skipped when it is a single byte). */
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        /* Accept the match only if it starts on a character boundary. */
        if (rb_enc_left_char_head(p, pp, e, enc) == pp)
            return len - rslen;
        RB_GC_GUARD(rs);
    }
    return len;
}
9464
9465 /*!
9466  * Returns the separator for arguments of rb_str_chomp.
9467  *
9468  * @return returns rb_rs ($/) as default, the default value of rb_rs ($/) is "\n".
9469  */
9470 static VALUE
9471 chomp_rs(int argc, const VALUE *argv)
9472 {
9473     rb_check_arity(argc, 0, 1);
9474     if (argc > 0) {
9475         VALUE rs = argv[0];
9476         if (!NIL_P(rs)) StringValue(rs);
9477         return rs;
9478     }
9479     else {
9480         return rb_rs;
9481     }
9482 }
9483
9484 VALUE
9485 rb_str_chomp_string(VALUE str, VALUE rs)
9486 {
9487     long olen = RSTRING_LEN(str);
9488     long len = chompped_length(str, rs);
9489     if (len >= olen) return Qnil;
9490     str_modify_keep_cr(str);
9491     STR_SET_LEN(str, len);
9492     TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9493     if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9494         ENC_CODERANGE_CLEAR(str);
9495     }
9496     return str;
9497 }
9498
9499 /*
9500  *  call-seq:
9501  *     str.chomp!(separator=$/)   -> str or nil
9502  *
9503  *  Modifies <i>str</i> in place as described for String#chomp,
9504  *  returning <i>str</i>, or <code>nil</code> if no modifications were
9505  *  made.
9506  */
9507
9508 static VALUE
9509 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9510 {
9511     VALUE rs;
9512     str_modifiable(str);
9513     if (RSTRING_LEN(str) == 0) return Qnil;
9514     rs = chomp_rs(argc, argv);
9515     if (NIL_P(rs)) return Qnil;
9516     return rb_str_chomp_string(str, rs);
9517 }
9518
9519
9520 /*
9521  *  call-seq:
9522  *     str.chomp(separator=$/)   -> new_str
9523  *
9524  *  Returns a new String with the given record separator removed
9525  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
9526  *  changed from the default Ruby record separator, then <code>chomp</code> also
9527  *  removes carriage return characters (that is, it will remove <code>\n</code>,
9528  *  <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9529  *  it will remove all trailing newlines from the string.
9530  *
9531  *     "hello".chomp                #=> "hello"
9532  *     "hello\n".chomp              #=> "hello"
9533  *     "hello\r\n".chomp            #=> "hello"
9534  *     "hello\n\r".chomp            #=> "hello\n"
9535  *     "hello\r".chomp              #=> "hello"
9536  *     "hello \n there".chomp       #=> "hello \n there"
9537  *     "hello".chomp("llo")         #=> "he"
9538  *     "hello\r\n\r\n".chomp('')    #=> "hello"
9539  *     "hello\r\n\r\r\n".chomp('')  #=> "hello\r\n\r"
9540  */
9541
9542 static VALUE
9543 rb_str_chomp(int argc, VALUE *argv, VALUE str)
9544 {
9545     VALUE rs = chomp_rs(argc, argv);
9546     if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9547     return rb_str_subseq(str, 0, chompped_length(str, rs));
9548 }
9549
9550 static long
9551 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9552 {
9553     const char *const start = s;
9554
9555     if (!s || s >= e) return 0;
9556
9557     /* remove spaces at head */
9558     if (single_byte_optimizable(str)) {
9559         while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9560     }
9561     else {
9562         while (s < e) {
9563             int n;
9564             unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9565
9566             if (cc && !rb_isspace(cc)) break;
9567             s += n;
9568         }
9569     }
9570     return s - start;
9571 }
9572
9573 /*
9574  *  call-seq:
9575  *     str.lstrip!   -> self or nil
9576  *
9577  *  Removes leading whitespace from the receiver.
9578  *  Returns the altered receiver, or +nil+ if no change was made.
9579  *  See also String#rstrip! and String#strip!.
9580  *
9581  *  Refer to String#strip for the definition of whitespace.
9582  *
9583  *     "  hello  ".lstrip!  #=> "hello  "
9584  *     "hello  ".lstrip!    #=> nil
9585  *     "hello".lstrip!      #=> nil
9586  */
9587
9588 static VALUE
9589 rb_str_lstrip_bang(VALUE str)
9590 {
9591     rb_encoding *enc;
9592     char *start, *s;
9593     long olen, loffset;
9594
9595     str_modify_keep_cr(str);
9596     enc = STR_ENC_GET(str);
9597     RSTRING_GETMEM(str, start, olen);
9598     loffset = lstrip_offset(str, start, start+olen, enc);
9599     if (loffset > 0) {
9600         long len = olen-loffset;
9601         s = start + loffset;
9602         memmove(start, s, len);
9603         STR_SET_LEN(str, len);
9604         TERM_FILL(start+len, rb_enc_mbminlen(enc));
9605         return str;
9606     }
9607     return Qnil;
9608 }
9609
9610
9611 /*
9612  *  call-seq:
9613  *     str.lstrip   -> new_str
9614  *
9615  *  Returns a copy of the receiver with leading whitespace removed.
9616  *  See also String#rstrip and String#strip.
9617  *
9618  *  Refer to String#strip for the definition of whitespace.
9619  *
9620  *     "  hello  ".lstrip   #=> "hello  "
9621  *     "hello".lstrip       #=> "hello"
9622  */
9623
9624 static VALUE
9625 rb_str_lstrip(VALUE str)
9626 {
9627     char *start;
9628     long len, loffset;
9629     RSTRING_GETMEM(str, start, len);
9630     loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9631     if (loffset <= 0) return str_duplicate(rb_cString, str);
9632     return rb_str_subseq(str, loffset, len - loffset);
9633 }
9634
9635 static long
9636 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9637 {
9638     const char *t;
9639
9640     rb_str_check_dummy_enc(enc);
9641     if (!s || s >= e) return 0;
9642     t = e;
9643
9644     /* remove trailing spaces or '\0's */
9645     if (single_byte_optimizable(str)) {
9646         unsigned char c;
9647         while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9648     }
9649     else {
9650         char *tp;
9651
9652         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9653             unsigned int c = rb_enc_codepoint(tp, e, enc);
9654             if (c && !rb_isspace(c)) break;
9655             t = tp;
9656         }
9657     }
9658     return e - t;
9659 }
9660
9661 /*
9662  *  call-seq:
9663  *     str.rstrip!   -> self or nil
9664  *
9665  *  Removes trailing whitespace from the receiver.
9666  *  Returns the altered receiver, or +nil+ if no change was made.
9667  *  See also String#lstrip! and String#strip!.
9668  *
9669  *  Refer to String#strip for the definition of whitespace.
9670  *
9671  *     "  hello  ".rstrip!  #=> "  hello"
9672  *     "  hello".rstrip!    #=> nil
9673  *     "hello".rstrip!      #=> nil
9674  */
9675
9676 static VALUE
9677 rb_str_rstrip_bang(VALUE str)
9678 {
9679     rb_encoding *enc;
9680     char *start;
9681     long olen, roffset;
9682
9683     str_modify_keep_cr(str);
9684     enc = STR_ENC_GET(str);
9685     RSTRING_GETMEM(str, start, olen);
9686     roffset = rstrip_offset(str, start, start+olen, enc);
9687     if (roffset > 0) {
9688         long len = olen - roffset;
9689
9690         STR_SET_LEN(str, len);
9691         TERM_FILL(start+len, rb_enc_mbminlen(enc));
9692         return str;
9693     }
9694     return Qnil;
9695 }
9696
9697
9698 /*
9699  *  call-seq:
9700  *     str.rstrip   -> new_str
9701  *
9702  *  Returns a copy of the receiver with trailing whitespace removed.
9703  *  See also String#lstrip and String#strip.
9704  *
9705  *  Refer to String#strip for the definition of whitespace.
9706  *
9707  *     "  hello  ".rstrip   #=> "  hello"
9708  *     "hello".rstrip       #=> "hello"
9709  */
9710
9711 static VALUE
9712 rb_str_rstrip(VALUE str)
9713 {
9714     rb_encoding *enc;
9715     char *start;
9716     long olen, roffset;
9717
9718     enc = STR_ENC_GET(str);
9719     RSTRING_GETMEM(str, start, olen);
9720     roffset = rstrip_offset(str, start, start+olen, enc);
9721
9722     if (roffset <= 0) return str_duplicate(rb_cString, str);
9723     return rb_str_subseq(str, 0, olen-roffset);
9724 }
9725
9726
9727 /*
9728  *  call-seq:
9729  *     str.strip!   -> self or nil
9730  *
9731  *  Removes leading and trailing whitespace from the receiver.
9732  *  Returns the altered receiver, or +nil+ if there was no change.
9733  *
9734  *  Refer to String#strip for the definition of whitespace.
9735  *
9736  *     "  hello  ".strip!  #=> "hello"
9737  *     "hello".strip!      #=> nil
9738  */
9739
9740 static VALUE
9741 rb_str_strip_bang(VALUE str)
9742 {
9743     char *start;
9744     long olen, loffset, roffset;
9745     rb_encoding *enc;
9746
9747     str_modify_keep_cr(str);
9748     enc = STR_ENC_GET(str);
9749     RSTRING_GETMEM(str, start, olen);
9750     loffset = lstrip_offset(str, start, start+olen, enc);
9751     roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9752
9753     if (loffset > 0 || roffset > 0) {
9754         long len = olen-roffset;
9755         if (loffset > 0) {
9756             len -= loffset;
9757             memmove(start, start + loffset, len);
9758         }
9759         STR_SET_LEN(str, len);
9760         TERM_FILL(start+len, rb_enc_mbminlen(enc));
9761         return str;
9762     }
9763     return Qnil;
9764 }
9765
9766
9767 /*
9768  *  call-seq:
9769  *     str.strip   -> new_str
9770  *
9771  *  Returns a copy of the receiver with leading and trailing whitespace removed.
9772  *
9773  *  Whitespace is defined as any of the following characters:
9774  *  null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9775  *
9776  *     "    hello    ".strip   #=> "hello"
9777  *     "\tgoodbye\r\n".strip   #=> "goodbye"
9778  *     "\x00\t\n\v\f\r ".strip #=> ""
9779  *     "hello".strip           #=> "hello"
9780  */
9781
9782 static VALUE
9783 rb_str_strip(VALUE str)
9784 {
9785     char *start;
9786     long olen, loffset, roffset;
9787     rb_encoding *enc = STR_ENC_GET(str);
9788
9789     RSTRING_GETMEM(str, start, olen);
9790     loffset = lstrip_offset(str, start, start+olen, enc);
9791     roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9792
9793     if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9794     return rb_str_subseq(str, loffset, olen-loffset-roffset);
9795 }
9796
9797 static VALUE
9798 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9799 {
9800     VALUE result, match;
9801     struct re_registers *regs;
9802     int i;
9803     long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9804     if (pos >= 0) {
9805         if (BUILTIN_TYPE(pat) == T_STRING) {
9806             regs = NULL;
9807             end = pos + RSTRING_LEN(pat);
9808         }
9809         else {
9810             match = rb_backref_get();
9811             regs = RMATCH_REGS(match);
9812             pos = BEG(0);
9813             end = END(0);
9814         }
9815         if (pos == end) {
9816             rb_encoding *enc = STR_ENC_GET(str);
9817             /*
9818              * Always consume at least one character of the input string
9819              */
9820             if (RSTRING_LEN(str) > end)
9821                 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9822                                                   RSTRING_END(str), enc);
9823             else
9824                 *start = end + 1;
9825         }
9826         else {
9827             *start = end;
9828         }
9829         if (!regs || regs->num_regs == 1) {
9830             result = rb_str_subseq(str, pos, end - pos);
9831             return result;
9832         }
9833         result = rb_ary_new2(regs->num_regs);
9834         for (i=1; i < regs->num_regs; i++) {
9835             VALUE s = Qnil;
9836             if (BEG(i) >= 0) {
9837                 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9838             }
9839             rb_ary_push(result, s);
9840         }
9841
9842         return result;
9843     }
9844     return Qnil;
9845 }
9846
9847
9848 /*
9849  *  call-seq:
9850  *     str.scan(pattern)                         -> array
9851  *     str.scan(pattern) {|match, ...| block }   -> str
9852  *
9853  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
9854  *  Regexp or a String). For each match, a result is
9855  *  generated and either added to the result array or passed to the block. If
9856  *  the pattern contains no groups, each individual result consists of the
9857  *  matched string, <code>$&</code>.  If the pattern contains groups, each
9858  *  individual result is itself an array containing one entry per group.
9859  *
9860  *     a = "cruel world"
9861  *     a.scan(/\w+/)        #=> ["cruel", "world"]
9862  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
9863  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
9864  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
9865  *
9866  *  And the block form:
9867  *
9868  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
9869  *     print "\n"
9870  *     a.scan(/(.)(.)/) {|x,y| print y, x }
9871  *     print "\n"
9872  *
9873  *  <em>produces:</em>
9874  *
9875  *     <<cruel>> <<world>>
9876  *     rceu lowlr
9877  */
9878
9879 static VALUE
9880 rb_str_scan(VALUE str, VALUE pat)
9881 {
9882     VALUE result;
9883     long start = 0;
9884     long last = -1, prev = 0;
9885     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9886
9887     pat = get_pat_quoted(pat, 1);
9888     mustnot_broken(str);
9889     if (!rb_block_given_p()) {
9890         VALUE ary = rb_ary_new();
9891
9892         while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9893             last = prev;
9894             prev = start;
9895             rb_ary_push(ary, result);
9896         }
9897         if (last >= 0) rb_pat_search(pat, str, last, 1);
9898         else rb_backref_set(Qnil);
9899         return ary;
9900     }
9901
9902     while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9903         last = prev;
9904         prev = start;
9905         rb_yield(result);
9906         str_mod_check(str, p, len);
9907     }
9908     if (last >= 0) rb_pat_search(pat, str, last, 1);
9909     return str;
9910 }
9911
9912
9913 /*
9914  *  call-seq:
9915  *     str.hex   -> integer
9916  *
9917  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
9918  *  (with an optional sign and an optional <code>0x</code>) and returns the
9919  *  corresponding number. Zero is returned on error.
9920  *
9921  *     "0x0a".hex     #=> 10
9922  *     "-1234".hex    #=> -4660
9923  *     "0".hex        #=> 0
9924  *     "wombat".hex   #=> 0
9925  */
9926
9927 static VALUE
9928 rb_str_hex(VALUE str)
9929 {
9930     return rb_str_to_inum(str, 16, FALSE);
9931 }
9932
9933
9934 /*
9935  *  call-seq:
9936  *     str.oct   -> integer
9937  *
9938  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
9939  *  optional sign) and returns the corresponding number.  Returns 0 if the
9940  *  conversion fails.
9941  *
9942  *     "123".oct       #=> 83
9943  *     "-377".oct      #=> -255
9944  *     "bad".oct       #=> 0
9945  *     "0377bad".oct   #=> 255
9946  *
9947  *  If +str+ starts with <code>0</code>, radix indicators are honored.
9948  *  See Kernel#Integer.
9949  */
9950
9951 static VALUE
9952 rb_str_oct(VALUE str)
9953 {
9954     return rb_str_to_inum(str, -8, FALSE);
9955 }
9956
9957 #ifndef HAVE_CRYPT_R
9958 # include "ruby/thread_native.h"
9959 # include "ruby/atomic.h"
9960
9961 static struct {
9962     rb_atomic_t initialized;
9963     rb_nativethread_lock_t lock;
9964 } crypt_mutex;
9965
9966 static void
9967 crypt_mutex_destroy(void)
9968 {
9969     RUBY_ASSERT_ALWAYS(crypt_mutex.initialized == 1);
9970     rb_nativethread_lock_destroy(&crypt_mutex.lock);
9971     crypt_mutex.initialized = 0;
9972 }
9973
9974 static void
9975 crypt_mutex_initialize(void)
9976 {
9977     rb_atomic_t i;
9978     while ((i = RUBY_ATOMIC_CAS(crypt_mutex.initialized, 0, 2)) == 2);
9979     switch (i) {
9980       case 0:
9981         rb_nativethread_lock_initialize(&crypt_mutex.lock);
9982         atexit(crypt_mutex_destroy);
9983         RUBY_ASSERT(crypt_mutex.initialized == 2);
9984         RUBY_ATOMIC_CAS(crypt_mutex.initialized, 2, 1);
9985         break;
9986       case 1:
9987         break;
9988       default:
9989         rb_bug("crypt_mutex.initialized: %d->%d", i, crypt_mutex.initialized);
9990     }
9991 }
9992 #endif
9993
9994 /*
9995  *  call-seq:
9996  *     str.crypt(salt_str)   -> new_str
9997  *
9998  *  Returns the string generated by calling <code>crypt(3)</code>
9999  *  standard library function with <code>str</code> and
10000  *  <code>salt_str</code>, in this order, as its arguments.  Please do
10001  *  not use this method any longer.  It is legacy; provided only for
10002  *  backward compatibility with ruby scripts in earlier days.  It is
10003  *  bad to use in contemporary programs for several reasons:
10004  *
 *  * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
 *    run on.  The generated string lacks data portability.
10007  *
10008  *  * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10009  *    (i.e. silently ends up in unexpected results).
10010  *
10011  *  * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10012  *    thread safe.
10013  *
10014  *  * So-called "traditional" usage of <code>crypt(3)</code> is very
10015  *    very very weak.  According to its manpage, Linux's traditional
10016  *    <code>crypt(3)</code> output has only 2**56 variations; too
10017  *    easy to brute force today.  And this is the default behaviour.
10018  *
10019  *  * In order to make things robust some OSes implement so-called
10020  *    "modular" usage. To go through, you have to do a complex
10021  *    build-up of the <code>salt_str</code> parameter, by hand.
10022  *    Failure in generation of a proper salt string tends not to
10023  *    yield any errors; typos in parameters are normally not
10024  *    detectable.
10025  *
10026  *    * For instance, in the following example, the second invocation
10027  *      of String#crypt is wrong; it has a typo in "round=" (lacks
10028  *      "s").  However the call does not fail and something unexpected
10029  *      is generated.
10030  *
10031  *         "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10032  *         "foo".crypt("$5$round=1000$salt$")  # Typo not detected
10033  *
10034  *  * Even in the "modular" mode, some hash functions are considered
10035  *    archaic and no longer recommended at all; for instance module
10036  *    <code>$1$</code> is officially abandoned by its author: see
10037  *    http://phk.freebsd.dk/sagas/md5crypt_eol/ .  For another
10038  *    instance module <code>$3$</code> is considered completely
10039  *    broken: see the manpage of FreeBSD.
10040  *
10041  *  * On some OS such as Mac OS, there is no modular mode. Yet, as
10042  *    written above, <code>crypt(3)</code> on Mac OS never fails.
10043  *    This means even if you build up a proper salt string it
10044  *    generates a traditional DES hash anyways, and there is no way
10045  *    for you to be aware of.
10046  *
10047  *        "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10048  *
10049  *  If for some reason you cannot migrate to other secure contemporary
10050  *  password hashing algorithms, install the string-crypt gem and
10051  *  <code>require 'string/crypt'</code> to continue using it.
10052  */
10053
10054 static VALUE
10055 rb_str_crypt(VALUE str, VALUE salt)
10056 {
10057 #ifdef HAVE_CRYPT_R
10058     VALUE databuf;
10059     struct crypt_data *data;
10060 #   define CRYPT_END() ALLOCV_END(databuf)
10061 #else
10062     extern char *crypt(const char *, const char *);
10063 #   define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10064 #endif
10065     VALUE result;
10066     const char *s, *saltp;
10067     char *res;
10068 #ifdef BROKEN_CRYPT
10069     char salt_8bit_clean[3];
10070 #endif
10071
10072     StringValue(salt);
10073     mustnot_wchar(str);
10074     mustnot_wchar(salt);
10075     s = StringValueCStr(str);
10076     saltp = RSTRING_PTR(salt);
10077     if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10078         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10079     }
10080
10081 #ifdef BROKEN_CRYPT
10082     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10083         salt_8bit_clean[0] = saltp[0] & 0x7f;
10084         salt_8bit_clean[1] = saltp[1] & 0x7f;
10085         salt_8bit_clean[2] = '\0';
10086         saltp = salt_8bit_clean;
10087     }
10088 #endif
10089 #ifdef HAVE_CRYPT_R
10090     data = ALLOCV(databuf, sizeof(struct crypt_data));
10091 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10092     data->initialized = 0;
10093 # endif
10094     res = crypt_r(s, saltp, data);
10095 #else
10096     crypt_mutex_initialize();
10097     rb_nativethread_lock_lock(&crypt_mutex.lock);
10098     res = crypt(s, saltp);
10099 #endif
10100     if (!res) {
10101         int err = errno;
10102         CRYPT_END();
10103         rb_syserr_fail(err, "crypt");
10104     }
10105     result = rb_str_new_cstr(res);
10106     CRYPT_END();
10107     return result;
10108 }
10109
10110
10111 /*
10112  *  call-seq:
10113  *     str.ord   -> integer
10114  *
10115  *  Returns the Integer ordinal of a one-character string.
10116  *
10117  *     "a".ord         #=> 97
10118  */
10119
10120 static VALUE
10121 rb_str_ord(VALUE s)
10122 {
10123     unsigned int c;
10124
10125     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10126     return UINT2NUM(c);
10127 }
10128 /*
10129  *  call-seq:
10130  *     str.sum(n=16)   -> integer
10131  *
10132  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
10133  *  where <em>n</em> is the optional Integer parameter, defaulting
 *  to 16. The result is simply the sum of the binary value of each byte in
 *  <i>str</i> modulo <code>2**n</code> (that is, masked with
 *  <code>2**n - 1</code>). This is not a particularly good checksum.
10137  */
10138
10139 static VALUE
10140 rb_str_sum(int argc, VALUE *argv, VALUE str)
10141 {
10142     int bits = 16;
10143     char *ptr, *p, *pend;
10144     long len;
10145     VALUE sum = INT2FIX(0);
10146     unsigned long sum0 = 0;
10147
10148     if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10149         bits = 0;
10150     }
10151     ptr = p = RSTRING_PTR(str);
10152     len = RSTRING_LEN(str);
10153     pend = p + len;
10154
10155     while (p < pend) {
10156         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10157             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10158             str_mod_check(str, ptr, len);
10159             sum0 = 0;
10160         }
10161         sum0 += (unsigned char)*p;
10162         p++;
10163     }
10164
10165     if (bits == 0) {
10166         if (sum0) {
10167             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10168         }
10169     }
10170     else {
10171         if (sum == INT2FIX(0)) {
10172             if (bits < (int)sizeof(long)*CHAR_BIT) {
10173                 sum0 &= (((unsigned long)1)<<bits)-1;
10174             }
10175             sum = LONG2FIX(sum0);
10176         }
10177         else {
10178             VALUE mod;
10179
10180             if (sum0) {
10181                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10182             }
10183
10184             mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10185             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10186             sum = rb_funcall(sum, '&', 1, mod);
10187         }
10188     }
10189     return sum;
10190 }
10191
10192 static VALUE
10193 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10194 {
10195     rb_encoding *enc;
10196     VALUE w;
10197     long width, len, flen = 1, fclen = 1;
10198     VALUE res;
10199     char *p;
10200     const char *f = " ";
10201     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10202     VALUE pad;
10203     int singlebyte = 1, cr;
10204     int termlen;
10205
10206     rb_scan_args(argc, argv, "11", &w, &pad);
10207     enc = STR_ENC_GET(str);
10208     termlen = rb_enc_mbminlen(enc);
10209     width = NUM2LONG(w);
10210     if (argc == 2) {
10211         StringValue(pad);
10212         enc = rb_enc_check(str, pad);
10213         f = RSTRING_PTR(pad);
10214         flen = RSTRING_LEN(pad);
10215         fclen = str_strlen(pad, enc); /* rb_enc_check */
10216         singlebyte = single_byte_optimizable(pad);
10217         if (flen == 0 || fclen == 0) {
10218             rb_raise(rb_eArgError, "zero width padding");
10219         }
10220     }
10221     len = str_strlen(str, enc); /* rb_enc_check */
10222     if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10223     n = width - len;
10224     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10225     rlen = n - llen;
10226     cr = ENC_CODERANGE(str);
10227     if (flen > 1) {
10228        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10229        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10230     }
10231     size = RSTRING_LEN(str);
10232     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10233        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10234        (len += llen2 + rlen2) >= LONG_MAX - size) {
10235        rb_raise(rb_eArgError, "argument too big");
10236     }
10237     len += size;
10238     res = str_new0(rb_cString, 0, len, termlen);
10239     p = RSTRING_PTR(res);
10240     if (flen <= 1) {
10241        memset(p, *f, llen);
10242        p += llen;
10243     }
10244     else {
10245        while (llen >= fclen) {
10246             memcpy(p,f,flen);
10247             p += flen;
10248             llen -= fclen;
10249         }
10250        if (llen > 0) {
10251            memcpy(p, f, llen2);
10252            p += llen2;
10253         }
10254     }
10255     memcpy(p, RSTRING_PTR(str), size);
10256     p += size;
10257     if (flen <= 1) {
10258        memset(p, *f, rlen);
10259        p += rlen;
10260     }
10261     else {
10262        while (rlen >= fclen) {
10263             memcpy(p,f,flen);
10264             p += flen;
10265             rlen -= fclen;
10266         }
10267        if (rlen > 0) {
10268            memcpy(p, f, rlen2);
10269            p += rlen2;
10270         }
10271     }
10272     TERM_FILL(p, termlen);
10273     STR_SET_LEN(res, p-RSTRING_PTR(res));
10274     rb_enc_associate(res, enc);
10275     if (argc == 2)
10276         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10277     if (cr != ENC_CODERANGE_BROKEN)
10278         ENC_CODERANGE_SET(res, cr);
10279
10280     RB_GC_GUARD(pad);
10281     return res;
10282 }
10283
10284
10285 /*
10286  *  call-seq:
10287  *     str.ljust(integer, padstr=' ')   -> new_str
10288  *
10289  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10290  *  String of length <i>integer</i> with <i>str</i> left justified
10291  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10292  *
10293  *     "hello".ljust(4)            #=> "hello"
10294  *     "hello".ljust(20)           #=> "hello               "
10295  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
10296  */
10297
10298 static VALUE
10299 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10300 {
10301     return rb_str_justify(argc, argv, str, 'l');
10302 }
10303
10304
10305 /*
10306  *  call-seq:
10307  *     str.rjust(integer, padstr=' ')   -> new_str
10308  *
10309  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10310  *  String of length <i>integer</i> with <i>str</i> right justified
10311  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10312  *
10313  *     "hello".rjust(4)            #=> "hello"
10314  *     "hello".rjust(20)           #=> "               hello"
10315  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
10316  */
10317
10318 static VALUE
10319 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10320 {
10321     return rb_str_justify(argc, argv, str, 'r');
10322 }
10323
10324
10325 /*
10326  *  call-seq:
10327  *     str.center(width, padstr=' ')   -> new_str
10328  *
10329  *  Centers +str+ in +width+.  If +width+ is greater than the length of +str+,
10330  *  returns a new String of length +width+ with +str+ centered and padded with
10331  *  +padstr+; otherwise, returns +str+.
10332  *
10333  *     "hello".center(4)         #=> "hello"
10334  *     "hello".center(20)        #=> "       hello        "
10335  *     "hello".center(20, '123') #=> "1231231hello12312312"
10336  */
10337
10338 static VALUE
10339 rb_str_center(int argc, VALUE *argv, VALUE str)
10340 {
10341     return rb_str_justify(argc, argv, str, 'c');
10342 }
10343
10344 /*
10345  *  call-seq:
10346  *     str.partition(sep)              -> [head, sep, tail]
10347  *     str.partition(regexp)           -> [head, match, tail]
10348  *
10349  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10350  *  and returns the part before it, the match, and the part
10351  *  after it.
 *  If it is not found, returns <i>str</i> and two empty strings.
10353  *
10354  *     "hello".partition("l")         #=> ["he", "l", "lo"]
10355  *     "hello".partition("x")         #=> ["hello", "", ""]
10356  *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
10357  */
10358
10359 static VALUE
10360 rb_str_partition(VALUE str, VALUE sep)
10361 {
10362     long pos;
10363
10364     sep = get_pat_quoted(sep, 0);
10365     if (RB_TYPE_P(sep, T_REGEXP)) {
10366         if (rb_reg_search(sep, str, 0, 0) < 0) {
10367             goto failed;
10368         }
10369         VALUE match = rb_backref_get();
10370         struct re_registers *regs = RMATCH_REGS(match);
10371
10372         pos = BEG(0);
10373         sep = rb_str_subseq(str, pos, END(0) - pos);
10374     }
10375     else {
10376         pos = rb_str_index(str, sep, 0);
10377         if (pos < 0) goto failed;
10378     }
10379     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10380                           sep,
10381                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
10382                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10383
10384   failed:
10385     return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10386 }
10387
10388 /*
10389  *  call-seq:
10390  *     str.rpartition(sep)             -> [head, sep, tail]
10391  *     str.rpartition(regexp)          -> [head, match, tail]
10392  *
10393  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10394  *  of the string, and returns the part before it, the match, and the part
10395  *  after it.
10396  *  If it is not found, returns two empty strings and <i>str</i>.
10397  *
10398  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
10399  *     "hello".rpartition("x")         #=> ["", "", "hello"]
10400  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
10401  *
10402  *  The match from the end means starting at the possible last position, not
10403  *  the last of longest matches.
10404  *
10405  *     "hello".rpartition(/l+/)        #=> ["hel", "l", "o"]
10406  *
10407  *  To partition at the last longest match, needs to combine with
10408  *  negative lookbehind.
10409  *
10410  *     "hello".rpartition(/(?<!l)l+/)  #=> ["he", "ll", "o"]
10411  *
 *  Or String#partition with negative lookahead.
10413  *
10414  *     "hello".partition(/l+(?!.*l)/)  #=> ["he", "ll", "o"]
10415  */
10416
10417 static VALUE
10418 rb_str_rpartition(VALUE str, VALUE sep)
10419 {
10420     long pos = RSTRING_LEN(str);
10421
10422     sep = get_pat_quoted(sep, 0);
10423     if (RB_TYPE_P(sep, T_REGEXP)) {
10424         if (rb_reg_search(sep, str, pos, 1) < 0) {
10425             goto failed;
10426         }
10427         VALUE match = rb_backref_get();
10428         struct re_registers *regs = RMATCH_REGS(match);
10429
10430         pos = BEG(0);
10431         sep = rb_str_subseq(str, pos, END(0) - pos);
10432     }
10433     else {
10434         pos = rb_str_sublen(str, pos);
10435         pos = rb_str_rindex(str, sep, pos);
10436         if (pos < 0) {
10437             goto failed;
10438         }
10439         pos = rb_str_offset(str, pos);
10440     }
10441
10442     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10443                           sep,
10444                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
10445                                         RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10446   failed:
10447     return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10448 }
10449
10450 /*
10451  *  call-seq:
10452  *     str.start_with?([prefixes]+)   -> true or false
10453  *
10454  *  Returns true if +str+ starts with one of the +prefixes+ given.
10455  *  Each of the +prefixes+ should be a String or a Regexp.
10456  *
10457  *    "hello".start_with?("hell")               #=> true
10458  *    "hello".start_with?(/H/i)                 #=> true
10459  *
10460  *    # returns true if one of the prefixes matches.
10461  *    "hello".start_with?("heaven", "hell")     #=> true
10462  *    "hello".start_with?("heaven", "paradise") #=> false
10463  */
10464
10465 static VALUE
10466 rb_str_start_with(int argc, VALUE *argv, VALUE str)
10467 {
10468     int i;
10469
10470     for (i=0; i<argc; i++) {
10471         VALUE tmp = argv[i];
10472         if (RB_TYPE_P(tmp, T_REGEXP)) {
10473             if (rb_reg_start_with_p(tmp, str))
10474                 return Qtrue;
10475         }
10476         else {
10477             StringValue(tmp);
10478             rb_enc_check(str, tmp);
10479             if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10480             if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10481                 return Qtrue;
10482         }
10483     }
10484     return Qfalse;
10485 }
10486
10487 /*
10488  *  call-seq:
10489  *     str.end_with?([suffixes]+)   -> true or false
10490  *
10491  *  Returns true if +str+ ends with one of the +suffixes+ given.
10492  *
10493  *    "hello".end_with?("ello")               #=> true
10494  *
10495  *    # returns true if one of the +suffixes+ matches.
10496  *    "hello".end_with?("heaven", "ello")     #=> true
10497  *    "hello".end_with?("heaven", "paradise") #=> false
10498  */
10499
10500 static VALUE
10501 rb_str_end_with(int argc, VALUE *argv, VALUE str)
10502 {
10503     int i;
10504     char *p, *s, *e;
10505     rb_encoding *enc;
10506
10507     for (i=0; i<argc; i++) {
10508         VALUE tmp = argv[i];
10509         long slen, tlen;
10510         StringValue(tmp);
10511         enc = rb_enc_check(str, tmp);
10512         if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10513         if ((slen = RSTRING_LEN(str)) < tlen) continue;
10514         p = RSTRING_PTR(str);
10515         e = p + slen;
10516         s = e - tlen;
10517         if (rb_enc_left_char_head(p, s, e, enc) != s)
10518             continue;
10519         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10520             return Qtrue;
10521     }
10522     return Qfalse;
10523 }
10524
10525 /*!
10526  * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
10527  * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
10528  *
10529  * @param str the target
10530  * @param prefix the prefix
10531  * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
10532  * @retval Positive-Integer otherwise
10533  */
10534 static long
10535 deleted_prefix_length(VALUE str, VALUE prefix)
10536 {
10537     char *strptr, *prefixptr;
10538     long olen, prefixlen;
10539
10540     StringValue(prefix);
10541     if (is_broken_string(prefix)) return 0;
10542     rb_enc_check(str, prefix);
10543
10544     /* return 0 if not start with prefix */
10545     prefixlen = RSTRING_LEN(prefix);
10546     if (prefixlen <= 0) return 0;
10547     olen = RSTRING_LEN(str);
10548     if (olen < prefixlen) return 0;
10549     strptr = RSTRING_PTR(str);
10550     prefixptr = RSTRING_PTR(prefix);
10551     if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10552
10553     return prefixlen;
10554 }
10555
10556 /*
10557  *  call-seq:
10558  *     str.delete_prefix!(prefix) -> self or nil
10559  *
10560  *  Deletes leading <code>prefix</code> from <i>str</i>, returning
10561  *  <code>nil</code> if no change was made.
10562  *
10563  *     "hello".delete_prefix!("hel") #=> "lo"
10564  *     "hello".delete_prefix!("llo") #=> nil
10565  */
10566
10567 static VALUE
10568 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10569 {
10570     long prefixlen;
10571     str_modify_keep_cr(str);
10572
10573     prefixlen = deleted_prefix_length(str, prefix);
10574     if (prefixlen <= 0) return Qnil;
10575
10576     return rb_str_drop_bytes(str, prefixlen);
10577 }
10578
10579 /*
10580  *  call-seq:
10581  *     str.delete_prefix(prefix) -> new_str
10582  *
10583  *  Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10584  *
10585  *     "hello".delete_prefix("hel") #=> "lo"
10586  *     "hello".delete_prefix("llo") #=> "hello"
10587  */
10588
10589 static VALUE
10590 rb_str_delete_prefix(VALUE str, VALUE prefix)
10591 {
10592     long prefixlen;
10593
10594     prefixlen = deleted_prefix_length(str, prefix);
10595     if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10596
10597     return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10598 }
10599
10600 /*!
10601  * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
10602  * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
10603  *
10604  * @param str the target
10605  * @param suffix the suffix
10606  * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
10607  * @retval Positive-Integer otherwise
10608  */
10609 static long
10610 deleted_suffix_length(VALUE str, VALUE suffix)
10611 {
10612     char *strptr, *suffixptr, *s;
10613     long olen, suffixlen;
10614     rb_encoding *enc;
10615
10616     StringValue(suffix);
10617     if (is_broken_string(suffix)) return 0;
10618     enc = rb_enc_check(str, suffix);
10619
10620     /* return 0 if not start with suffix */
10621     suffixlen = RSTRING_LEN(suffix);
10622     if (suffixlen <= 0) return 0;
10623     olen = RSTRING_LEN(str);
10624     if (olen < suffixlen) return 0;
10625     strptr = RSTRING_PTR(str);
10626     suffixptr = RSTRING_PTR(suffix);
10627     s = strptr + olen - suffixlen;
10628     if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10629     if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10630
10631     return suffixlen;
10632 }
10633
10634 /*
10635  *  call-seq:
10636  *     str.delete_suffix!(suffix) -> self or nil
10637  *
10638  *  Deletes trailing <code>suffix</code> from <i>str</i>, returning
10639  *  <code>nil</code> if no change was made.
10640  *
10641  *     "hello".delete_suffix!("llo") #=> "he"
10642  *     "hello".delete_suffix!("hel") #=> nil
10643  */
10644
10645 static VALUE
10646 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10647 {
10648     long olen, suffixlen, len;
10649     str_modifiable(str);
10650
10651     suffixlen = deleted_suffix_length(str, suffix);
10652     if (suffixlen <= 0) return Qnil;
10653
10654     olen = RSTRING_LEN(str);
10655     str_modify_keep_cr(str);
10656     len = olen - suffixlen;
10657     STR_SET_LEN(str, len);
10658     TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10659     if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10660         ENC_CODERANGE_CLEAR(str);
10661     }
10662     return str;
10663 }
10664
10665 /*
10666  *  call-seq:
10667  *     str.delete_suffix(suffix) -> new_str
10668  *
10669  *  Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10670  *
10671  *     "hello".delete_suffix("llo") #=> "he"
10672  *     "hello".delete_suffix("hel") #=> "hello"
10673  */
10674
10675 static VALUE
10676 rb_str_delete_suffix(VALUE str, VALUE suffix)
10677 {
10678     long suffixlen;
10679
10680     suffixlen = deleted_suffix_length(str, suffix);
10681     if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10682
10683     return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10684 }
10685
10686 void
10687 rb_str_setter(VALUE val, ID id, VALUE *var)
10688 {
10689     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10690         rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10691     }
10692     *var = val;
10693 }
10694
10695 static void
10696 rb_fs_setter(VALUE val, ID id, VALUE *var)
10697 {
10698     val = rb_fs_check(val);
10699     if (!val) {
10700         rb_raise(rb_eTypeError,
10701                  "value of %"PRIsVALUE" must be String or Regexp",
10702                  rb_id2str(id));
10703     }
10704     if (!NIL_P(val)) {
10705         rb_warn_deprecated("`$;'", NULL);
10706     }
10707     *var = val;
10708 }
10709
10710
10711 /*
10712  *  call-seq:
10713  *     str.force_encoding(encoding)   -> str
10714  *
10715  *  Changes the encoding to +encoding+ and returns self.
10716  */
10717
10718 static VALUE
10719 rb_str_force_encoding(VALUE str, VALUE enc)
10720 {
10721     str_modifiable(str);
10722     rb_enc_associate(str, rb_to_encoding(enc));
10723     ENC_CODERANGE_CLEAR(str);
10724     return str;
10725 }
10726
10727 /*
10728  *  call-seq:
10729  *     str.b   -> str
10730  *
10731  *  Returns a copied string whose encoding is ASCII-8BIT.
10732  */
10733
10734 static VALUE
10735 rb_str_b(VALUE str)
10736 {
10737     VALUE str2;
10738     if (FL_TEST(str, STR_NOEMBED)) {
10739         str2 = str_alloc_heap(rb_cString);
10740     }
10741     else {
10742         str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10743     }
10744     str_replace_shared_without_enc(str2, str);
10745     ENC_CODERANGE_CLEAR(str2);
10746     return str2;
10747 }
10748
10749 /*
10750  *  call-seq:
10751  *     str.valid_encoding?  -> true or false
10752  *
10753  *  Returns true for a string which is encoded correctly.
10754  *
10755  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
10756  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
10757  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
10758  */
10759
10760 static VALUE
10761 rb_str_valid_encoding_p(VALUE str)
10762 {
10763     int cr = rb_enc_str_coderange(str);
10764
10765     return RBOOL(cr != ENC_CODERANGE_BROKEN);
10766 }
10767
10768 /*
10769  *  call-seq:
10770  *     str.ascii_only?  -> true or false
10771  *
10772  *  Returns true for a string which has only ASCII characters.
10773  *
10774  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
10775  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
10776  */
10777
10778 static VALUE
10779 rb_str_is_ascii_only_p(VALUE str)
10780 {
10781     int cr = rb_enc_str_coderange(str);
10782
10783     return RBOOL(cr == ENC_CODERANGE_7BIT);
10784 }
10785
10786 VALUE
10787 rb_str_ellipsize(VALUE str, long len)
10788 {
10789     static const char ellipsis[] = "...";
10790     const long ellipsislen = sizeof(ellipsis) - 1;
10791     rb_encoding *const enc = rb_enc_get(str);
10792     const long blen = RSTRING_LEN(str);
10793     const char *const p = RSTRING_PTR(str), *e = p + blen;
10794     VALUE estr, ret = 0;
10795
10796     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10797     if (len * rb_enc_mbminlen(enc) >= blen ||
10798         (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10799         ret = str;
10800     }
10801     else if (len <= ellipsislen ||
10802              !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10803         if (rb_enc_asciicompat(enc)) {
10804             ret = rb_str_new(ellipsis, len);
10805             rb_enc_associate(ret, enc);
10806         }
10807         else {
10808             estr = rb_usascii_str_new(ellipsis, len);
10809             ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10810         }
10811     }
10812     else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10813         rb_str_cat(ret, ellipsis, ellipsislen);
10814     }
10815     else {
10816         estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10817                              rb_enc_from_encoding(enc), 0, Qnil);
10818         rb_str_append(ret, estr);
10819     }
10820     return ret;
10821 }
10822
10823 static VALUE
10824 str_compat_and_valid(VALUE str, rb_encoding *enc)
10825 {
10826     int cr;
10827     str = StringValue(str);
10828     cr = rb_enc_str_coderange(str);
10829     if (cr == ENC_CODERANGE_BROKEN) {
10830         rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10831     }
10832     else {
10833         rb_encoding *e = STR_ENC_GET(str);
10834         if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10835             rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10836                      rb_enc_name(enc), rb_enc_name(e));
10837         }
10838     }
10839     return str;
10840 }
10841
10842 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10843
10844 VALUE
10845 rb_str_scrub(VALUE str, VALUE repl)
10846 {
10847     rb_encoding *enc = STR_ENC_GET(str);
10848     return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10849 }
10850
10851 VALUE
10852 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10853 {
10854     int cr = ENC_CODERANGE_UNKNOWN;
10855     if (enc == STR_ENC_GET(str)) {
10856         /* cached coderange makes sense only when enc equals the
10857          * actual encoding of str */
10858         cr = ENC_CODERANGE(str);
10859     }
10860     return enc_str_scrub(enc, str, repl, cr);
10861 }
10862
/*
 * Core of String#scrub: scans str, interpreted in encoding enc, and builds
 * a new string in which every invalid byte sequence has been replaced.
 *
 * enc  - encoding used to interpret the bytes of str.
 * str  - string to scan; its bytes are not modified here, but its code
 *        range flag is set when the scan finds nothing to replace.
 * repl - replacement string, or nil. When nil and no block is given, a
 *        default replacement chosen from the encoding is used.
 * cr   - code range hint for str under enc; when it is "clean" the
 *        function returns immediately.
 *
 * A block may be given instead of repl; it is yielded each invalid byte
 * run and its result (validated by str_compat_and_valid) is used as the
 * replacement. Passing both a block and repl raises ArgumentError.
 *
 * Returns Qnil when nothing had to be replaced (or enc is a dummy
 * encoding); otherwise returns the newly built string, tagged with enc and
 * the resulting code range.
 */
static VALUE
enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
{
    int encidx;
    VALUE buf = Qnil;   /* output buffer, created lazily on the first replacement */
    const char *rep, *p, *e, *p1, *sp;
    long replen = -1;   /* -1: not decided yet; 0: replacements come from the block */
    long slen;

    if (rb_block_given_p()) {
        if (!NIL_P(repl))
            rb_raise(rb_eArgError, "both of block and replacement given");
        replen = 0;
    }

    if (ENC_CODERANGE_CLEAN_P(cr))
        return Qnil;

    if (!NIL_P(repl)) {
        repl = str_compat_and_valid(repl, enc);
    }

    if (rb_enc_dummy_p(enc)) {
        return Qnil;
    }
    encidx = rb_enc_to_index(enc);

/* Points rep/replen at a static copy of the literal, without its
 * terminating NUL (hence sizeof(str)-1). */
#define DEFAULT_REPLACE_CHAR(str) do { \
        static const char replace[sizeof(str)-1] = str; \
        rep = replace; replen = (int)sizeof(replace); \
    } while (0)

    slen = RSTRING_LEN(str);
    p = RSTRING_PTR(str);
    e = RSTRING_END(str);
    p1 = p;     /* start of the scanned run that has not been copied to buf yet */
    sp = p;     /* original pointer/length, checked by str_mod_check() after each yield */

    if (rb_enc_asciicompat(enc)) {
        int rep7bit_p;  /* whether the fixed replacement is 7bit (keeps cr at 7BIT) */
        if (!replen) {
            /* block given: rep == NULL makes the code below yield instead */
            rep = NULL;
            rep7bit_p = FALSE;
        }
        else if (!NIL_P(repl)) {
            rep = RSTRING_PTR(repl);
            replen = RSTRING_LEN(repl);
            rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
        }
        else if (encidx == rb_utf8_encindex()) {
            DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
            rep7bit_p = FALSE;
        }
        else {
            DEFAULT_REPLACE_CHAR("?");
            rep7bit_p = TRUE;
        }
        /* start from 7BIT and upgrade to VALID once a multibyte character
         * or a non-7bit replacement is seen */
        cr = ENC_CODERANGE_7BIT;

        /* NOTE(review): search_nonascii() is defined outside this section;
         * from its use here it appears to return NULL when no non-ASCII
         * byte remains -- confirm. */
        p = search_nonascii(p, e);
        if (!p) {
            p = e;
        }
        while (p < e) {
            int ret = rb_enc_precise_mbclen(p, e, enc);
            if (MBCLEN_NEEDMORE_P(ret)) {
                /* incomplete character at the end; handled after the loop */
                break;
            }
            else if (MBCLEN_CHARFOUND_P(ret)) {
                cr = ENC_CODERANGE_VALID;
                p += MBCLEN_CHARFOUND_LEN(ret);
            }
            else if (MBCLEN_INVALID_P(ret)) {
                /*
                 * p1~p: valid ascii/multibyte chars
                 * p ~e: invalid bytes + unknown bytes
                 */
                long clen = rb_enc_mbmaxlen(enc);
                if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
                if (p > p1) {
                    rb_str_buf_cat(buf, p1, p - p1);
                }

                /* Decide how many bytes (clen) make up this invalid unit:
                 * never more than what is left, and a single byte when at
                 * most two bytes are under consideration. */
                if (e - p < clen) clen = e - p;
                if (clen <= 2) {
                    clen = 1;
                }
                else {
                    /* Shrink the candidate length until the prefix is
                     * reported as an incomplete character; if every shorter
                     * prefix is still invalid the loop ends with clen == 1. */
                    const char *q = p;
                    clen--;
                    for (; clen > 1; clen--) {
                        ret = rb_enc_precise_mbclen(q, q + clen, enc);
                        if (MBCLEN_NEEDMORE_P(ret)) break;
                        if (MBCLEN_INVALID_P(ret)) continue;
                        UNREACHABLE;
                    }
                }
                if (rep) {
                    rb_str_buf_cat(buf, rep, replen);
                    if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
                }
                else {
                    repl = rb_yield(rb_enc_str_new(p, clen, enc));
                    /* the block ran arbitrary code: verify str was not changed */
                    str_mod_check(str, sp, slen);
                    repl = str_compat_and_valid(repl, enc);
                    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                    if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
                        cr = ENC_CODERANGE_VALID;
                }
                p += clen;
                p1 = p;
                /* skip ahead over the following ASCII bytes */
                p = search_nonascii(p, e);
                if (!p) {
                    p = e;
                    break;
                }
            }
            else {
                UNREACHABLE;
            }
        }
        if (NIL_P(buf)) {
            if (p == e) {
                /* whole string scanned with nothing to replace: record the
                 * computed code range on str and report "no change" */
                ENC_CODERANGE_SET(str, cr);
                return Qnil;
            }
            buf = rb_str_buf_new(RSTRING_LEN(str));
        }
        /* flush the scanned run that precedes the stopping point */
        if (p1 < p) {
            rb_str_buf_cat(buf, p1, p - p1);
        }
        /* the loop stopped early on an incomplete trailing character:
         * replace the remaining bytes as a single unit */
        if (p < e) {
            if (rep) {
                rb_str_buf_cat(buf, rep, replen);
                if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
            }
            else {
                repl = rb_yield(rb_enc_str_new(p, e-p, enc));
                str_mod_check(str, sp, slen);
                repl = str_compat_and_valid(repl, enc);
                rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
                    cr = ENC_CODERANGE_VALID;
            }
        }
    }
    else {
        /* ASCII incompatible */
        long mbminlen = rb_enc_mbminlen(enc);
        if (!replen) {
            /* block given: rep == NULL makes the code below yield instead */
            rep = NULL;
        }
        else if (!NIL_P(repl)) {
            rep = RSTRING_PTR(repl);
            replen = RSTRING_LEN(repl);
        }
        else if (encidx == ENCINDEX_UTF_16BE) {
            DEFAULT_REPLACE_CHAR("\xFF\xFD");
        }
        else if (encidx == ENCINDEX_UTF_16LE) {
            DEFAULT_REPLACE_CHAR("\xFD\xFF");
        }
        else if (encidx == ENCINDEX_UTF_32BE) {
            DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
        }
        else if (encidx == ENCINDEX_UTF_32LE) {
            DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
        }
        else {
            DEFAULT_REPLACE_CHAR("?");
        }

        while (p < e) {
            int ret = rb_enc_precise_mbclen(p, e, enc);
            if (MBCLEN_NEEDMORE_P(ret)) {
                /* incomplete character at the end; handled after the loop */
                break;
            }
            else if (MBCLEN_CHARFOUND_P(ret)) {
                p += MBCLEN_CHARFOUND_LEN(ret);
            }
            else if (MBCLEN_INVALID_P(ret)) {
                const char *q = p;
                long clen = rb_enc_mbmaxlen(enc);
                if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
                if (p > p1) rb_str_buf_cat(buf, p1, p - p1);

                /* Same length search as in the ASCII-compatible branch, but
                 * stepping in units of mbminlen bytes instead of one byte. */
                if (e - p < clen) clen = e - p;
                if (clen <= mbminlen * 2) {
                    clen = mbminlen;
                }
                else {
                    clen -= mbminlen;
                    for (; clen > mbminlen; clen-=mbminlen) {
                        ret = rb_enc_precise_mbclen(q, q + clen, enc);
                        if (MBCLEN_NEEDMORE_P(ret)) break;
                        if (MBCLEN_INVALID_P(ret)) continue;
                        UNREACHABLE;
                    }
                }
                if (rep) {
                    rb_str_buf_cat(buf, rep, replen);
                }
                else {
                    repl = rb_yield(rb_enc_str_new(p, clen, enc));
                    /* the block ran arbitrary code: verify str was not changed */
                    str_mod_check(str, sp, slen);
                    repl = str_compat_and_valid(repl, enc);
                    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                }
                p += clen;
                p1 = p;
            }
            else {
                UNREACHABLE;
            }
        }
        if (NIL_P(buf)) {
            if (p == e) {
                /* whole string scanned with nothing to replace */
                ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
                return Qnil;
            }
            buf = rb_str_buf_new(RSTRING_LEN(str));
        }
        /* flush the scanned run that precedes the stopping point */
        if (p1 < p) {
            rb_str_buf_cat(buf, p1, p - p1);
        }
        /* incomplete trailing character: replace the rest as one unit */
        if (p < e) {
            if (rep) {
                rb_str_buf_cat(buf, rep, replen);
            }
            else {
                repl = rb_yield(rb_enc_str_new(p, e-p, enc));
                str_mod_check(str, sp, slen);
                repl = str_compat_and_valid(repl, enc);
                rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
            }
        }
        cr = ENC_CODERANGE_VALID;
    }
    /* tag the result with the scan encoding and the resulting code range */
    ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
    return buf;
}
11104
11105 /*
11106  *  call-seq:
11107  *    str.scrub -> new_str
11108  *    str.scrub(repl) -> new_str
11109  *    str.scrub{|bytes|} -> new_str
11110  *
 *  If the string contains invalid byte sequences, returns a new string in which
 *  the invalid bytes are replaced with the given replacement string;
 *  otherwise returns a copy of the string.
 *  If a block is given, invalid bytes are replaced with the value returned by the block.
11114  *
11115  *     "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
11116  *     "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
11117  *     "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11118  */
11119 static VALUE
11120 str_scrub(int argc, VALUE *argv, VALUE str)
11121 {
11122     VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11123     VALUE new = rb_str_scrub(str, repl);
11124     return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11125 }
11126
11127 /*
11128  *  call-seq:
11129  *    str.scrub! -> str
11130  *    str.scrub!(repl) -> str
11131  *    str.scrub!{|bytes|} -> str
11132  *
 *  If the string contains invalid byte sequences, replaces the invalid bytes
 *  in place with the given replacement string. Always returns self.
 *  If a block is given, invalid bytes are replaced with the value returned by the block.
11136  *
11137  *     "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
11138  *     "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
11139  *     "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11140  */
11141 static VALUE
11142 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11143 {
11144     VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11145     VALUE new = rb_str_scrub(str, repl);
11146     if (!NIL_P(new)) rb_str_replace(str, new);
11147     return str;
11148 }
11149
/* Method IDs handed to unicode_normalize_common() by the
 * String#unicode_normalize* wrappers below.
 * NOTE(review): their initialization is not in this section -- presumably
 * done during String setup; confirm. */
static ID id_normalize;
static ID id_normalized_p;
/* Receiver of the rb_funcallv() call made in unicode_normalize_common(). */
static VALUE mUnicodeNormalize;
11153
11154 static VALUE
11155 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11156 {
11157     static int UnicodeNormalizeRequired = 0;
11158     VALUE argv2[2];
11159
11160     if (!UnicodeNormalizeRequired) {
11161         rb_require("unicode_normalize/normalize.rb");
11162         UnicodeNormalizeRequired = 1;
11163     }
11164     argv2[0] = str;
11165     if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11166     return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11167 }
11168
11169 /*
11170  *  call-seq:
11171  *    str.unicode_normalize(form=:nfc)
11172  *
11173  *  Unicode Normalization---Returns a normalized form of +str+,
11174  *  using Unicode normalizations NFC, NFD, NFKC, or NFKD.
11175  *  The normalization form used is determined by +form+, which can
11176  *  be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11177  *  The default is +:nfc+.
11178  *
11179  *  If the string is not in a Unicode Encoding, then an Exception is raised.
11180  *  In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
11181  *  and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
11182  *  Anything other than UTF-8 is implemented by converting to UTF-8,
11183  *  which makes it slower than UTF-8.
11184  *
11185  *    "a\u0300".unicode_normalize        #=> "\u00E0"
11186  *    "a\u0300".unicode_normalize(:nfc)  #=> "\u00E0"
11187  *    "\u00E0".unicode_normalize(:nfd)   #=> "a\u0300"
11188  *    "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
11189  *                                       #=> Encoding::CompatibilityError raised
11190  */
11191 static VALUE
11192 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11193 {
11194     return unicode_normalize_common(argc, argv, str, id_normalize);
11195 }
11196
11197 /*
11198  *  call-seq:
11199  *    str.unicode_normalize!(form=:nfc)
11200  *
11201  *  Destructive version of String#unicode_normalize, doing Unicode
11202  *  normalization in place.
11203  */
11204 static VALUE
11205 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11206 {
11207     return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11208 }
11209
11210 /*  call-seq:
11211  *    str.unicode_normalized?(form=:nfc)
11212  *
11213  *  Checks whether +str+ is in Unicode normalization form +form+,
11214  *  which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11215  *  The default is +:nfc+.
11216  *
11217  *  If the string is not in a Unicode Encoding, then an Exception is raised.
11218  *  For details, see String#unicode_normalize.
11219  *
11220  *    "a\u0300".unicode_normalized?        #=> false
11221  *    "a\u0300".unicode_normalized?(:nfd)  #=> true
11222  *    "\u00E0".unicode_normalized?         #=> true
11223  *    "\u00E0".unicode_normalized?(:nfd)   #=> false
11224  *    "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
11225  *                                         #=> Encoding::CompatibilityError raised
11226  */
11227 static VALUE
11228 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11229 {
11230     return unicode_normalize_common(argc, argv, str, id_normalized_p);
11231 }
11232
11233 /**********************************************************************
11234  * Document-class: Symbol
11235  *
11236  * Symbol objects represent named identifiers inside the Ruby interpreter.
11237  *
11238  * You can create a \Symbol object explicitly with:
11239  *
11240  * - A {symbol literal}[doc/syntax/literals_rdoc.html#label-Symbol+Literals].
11241  *
11242  * The same Symbol object will be
11243  * created for a given name or string for the duration of a program's
11244  * execution, regardless of the context or meaning of that name. Thus
11245  * if <code>Fred</code> is a constant in one context, a method in
11246  * another, and a class in a third, the Symbol <code>:Fred</code>
11247  * will be the same object in all three contexts.
11248  *
11249  *     module One
11250  *       class Fred
11251  *       end
11252  *       $f1 = :Fred
11253  *     end
11254  *     module Two
11255  *       Fred = 1
11256  *       $f2 = :Fred
11257  *     end
11258  *     def Fred()
11259  *     end
11260  *     $f3 = :Fred
11261  *     $f1.object_id   #=> 2514190
11262  *     $f2.object_id   #=> 2514190
11263  *     $f3.object_id   #=> 2514190
11264  *
11265  * Constant, method, and variable names are returned as symbols:
11266  *
11267  *     module One
11268  *       Two = 2
11269  *       def three; 3 end
11270  *       @four = 4
11271  *       @@five = 5
11272  *       $six = 6
11273  *     end
11274  *     seven = 7
11275  *
11276  *     One.constants
11277  *     # => [:Two]
11278  *     One.instance_methods(true)
11279  *     # => [:three]
11280  *     One.instance_variables
11281  *     # => [:@four]
11282  *     One.class_variables
11283  *     # => [:@@five]
11284  *     global_variables.grep(/six/)
11285  *     # => [:$six]
11286  *     local_variables
11287  *     # => [:seven]
11288  *
11289  * Symbol objects are different from String objects in that
11290  * Symbol objects represent identifiers, while String objects
11291  * represent text or data.
11292  *
11293  * == What's Here
11294  *
11295  * First, what's elsewhere. \Class \Symbol:
11296  *
11297  * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
11298  * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
11299  *
11300  * Here, class \Symbol provides methods that are useful for:
11301  *
11302  * - {Querying}[#class-Symbol-label-Methods+for+Querying]
11303  * - {Comparing}[#class-Symbol-label-Methods+for+Comparing]
11304  * - {Converting}[#class-Symbol-label-Methods+for+Converting]
11305  *
11306  * === Methods for Querying
11307  *
11308  * - ::all_symbols:: Returns an array of the symbols currently in Ruby's symbol table.
11309  * - {#=~}[#method-i-3D~]:: Returns the index of the first substring
11310  *                          in symbol that matches a given Regexp
11311  *                          or other object; returns +nil+ if no match is found.
11312  * - #[], #slice :: Returns a substring of symbol
11313  *                  determined by a given index, start/length, or range, or string.
11314  * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11315  * - #encoding:: Returns the Encoding object that represents the encoding
11316  *               of symbol.
11317  * - #end_with?:: Returns +true+ if symbol ends with
11318  *                any of the given strings.
11319  * - #match:: Returns a MatchData object if symbol
11320  *            matches a given Regexp; +nil+ otherwise.
11321  * - #match?:: Returns +true+ if symbol
11322  *             matches a given Regexp; +false+ otherwise.
11323  * - #length, #size:: Returns the number of characters in symbol.
11324  * - #start_with?:: Returns +true+ if symbol starts with
11325  *                  any of the given strings.
11326  *
11327  * === Methods for Comparing
11328  *
11329  * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as a given symbol is smaller than, equal to, or larger than symbol.
11330  * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given symbol
11331  *                                  has the same content and encoding.
11332  * - #casecmp:: Ignoring case, returns -1, 0, or 1 as a given
11333  *              symbol is smaller than, equal to, or larger than symbol.
11334  * - #casecmp?:: Returns +true+ if symbol is equal to a given symbol
11335  *               after Unicode case folding; +false+ otherwise.
11336  *
11337  * === Methods for Converting
11338  *
11339  * - #capitalize:: Returns symbol with the first character upcased
11340  *                 and all other characters downcased.
11341  * - #downcase:: Returns symbol with all characters downcased.
11342  * - #inspect:: Returns the string representation of +self+ as a symbol literal.
11343  * - #name:: Returns the frozen string corresponding to symbol.
11344  * - #succ, #next:: Returns the symbol that is the successor to symbol.
11345  * - #swapcase:: Returns symbol with all upcase characters downcased
11346  *               and all downcase characters upcased.
11347  * - #to_proc:: Returns a Proc object which responds to the method named by symbol.
11348  * - #to_s, #id2name:: Returns the string corresponding to +self+.
11349  * - #to_sym, #intern:: Returns +self+.
11350  * - #upcase:: Returns symbol with all characters upcased.
11351  *
11352  */
11353
11354
11355 /*
11356  *  call-seq:
11357  *     sym == obj   -> true or false
11358  *
11359  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
11360  *  symbol, returns <code>true</code>.
11361  */
11362
11363 #define sym_equal rb_obj_equal
11364
11365 static int
11366 sym_printable(const char *s, const char *send, rb_encoding *enc)
11367 {
11368     while (s < send) {
11369         int n;
11370         int c = rb_enc_precise_mbclen(s, send, enc);
11371
11372         if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11373         n = MBCLEN_CHARFOUND_LEN(c);
11374         c = rb_enc_mbc_to_codepoint(s, send, enc);
11375         if (!rb_enc_isprint(c, enc)) return FALSE;
11376         s += n;
11377     }
11378     return TRUE;
11379 }
11380
11381 int
11382 rb_str_symname_p(VALUE sym)
11383 {
11384     rb_encoding *enc;
11385     const char *ptr;
11386     long len;
11387     rb_encoding *resenc = rb_default_internal_encoding();
11388
11389     if (resenc == NULL) resenc = rb_default_external_encoding();
11390     enc = STR_ENC_GET(sym);
11391     ptr = RSTRING_PTR(sym);
11392     len = RSTRING_LEN(sym);
11393     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11394         !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11395         return FALSE;
11396     }
11397     return TRUE;
11398 }
11399
11400 VALUE
11401 rb_str_quote_unprintable(VALUE str)
11402 {
11403     rb_encoding *enc;
11404     const char *ptr;
11405     long len;
11406     rb_encoding *resenc;
11407
11408     Check_Type(str, T_STRING);
11409     resenc = rb_default_internal_encoding();
11410     if (resenc == NULL) resenc = rb_default_external_encoding();
11411     enc = STR_ENC_GET(str);
11412     ptr = RSTRING_PTR(str);
11413     len = RSTRING_LEN(str);
11414     if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11415         !sym_printable(ptr, ptr + len, enc)) {
11416         return rb_str_escape(str);
11417     }
11418     return str;
11419 }
11420
11421 MJIT_FUNC_EXPORTED VALUE
11422 rb_id_quote_unprintable(ID id)
11423 {
11424     VALUE str = rb_id2str(id);
11425     if (!rb_str_symname_p(str)) {
11426         return rb_str_escape(str);
11427     }
11428     return str;
11429 }
11430
11431 /*
11432  *  call-seq:
11433  *     sym.inspect    -> string
11434  *
11435  *  Returns the representation of <i>sym</i> as a symbol literal.
11436  *
11437  *     :fred.inspect   #=> ":fred"
11438  */
11439
11440 static VALUE
11441 sym_inspect(VALUE sym)
11442 {
11443     VALUE str = rb_sym2str(sym);
11444     const char *ptr;
11445     long len;
11446     char *dest;
11447
11448     if (!rb_str_symname_p(str)) {
11449         str = rb_str_inspect(str);
11450         len = RSTRING_LEN(str);
11451         rb_str_resize(str, len + 1);
11452         dest = RSTRING_PTR(str);
11453         memmove(dest + 1, dest, len);
11454     }
11455     else {
11456         rb_encoding *enc = STR_ENC_GET(str);
11457         RSTRING_GETMEM(str, ptr, len);
11458         str = rb_enc_str_new(0, len + 1, enc);
11459         dest = RSTRING_PTR(str);
11460         memcpy(dest + 1, ptr, len);
11461     }
11462     dest[0] = ':';
11463     return str;
11464 }
11465
11466 #if 0 /* for RDoc */
11467 /*
11468  *  call-seq:
11469  *     sym.name   -> string
11470  *
11471  *  Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
11472  *  returned string is frozen.
11473  *
11474  *     :fred.name         #=> "fred"
11475  *     :fred.name.frozen? #=> true
11476  *     :fred.to_s         #=> "fred"
11477  *     :fred.to_s.frozen? #=> false
11478  */
11479 VALUE
11480 rb_sym2str(VALUE sym)
11481 {
11482
11483 }
11484 #endif
11485
11486
11487 /*
11488  *  call-seq:
11489  *     sym.id2name   -> string
11490  *     sym.to_s      -> string
11491  *
11492  *  Returns the name or string corresponding to <i>sym</i>.
11493  *
11494  *     :fred.id2name   #=> "fred"
11495  *     :ginger.to_s    #=> "ginger"
11496  *
11497  *  Note that this string is not frozen (unlike the symbol itself).
11498  *  To get a frozen string, use #name.
11499  */
11500
11501
11502 VALUE
11503 rb_sym_to_s(VALUE sym)
11504 {
11505     return str_new_shared(rb_cString, rb_sym2str(sym));
11506 }
11507
11508
11509 /*
11510  * call-seq:
11511  *   sym.to_sym   -> sym
11512  *   sym.intern   -> sym
11513  *
11514  * In general, <code>to_sym</code> returns the Symbol corresponding
11515  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
11516  * in this case.
11517  */
11518
11519 static VALUE
11520 sym_to_sym(VALUE sym)
11521 {
11522     return sym;
11523 }
11524
11525 MJIT_FUNC_EXPORTED VALUE
11526 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11527 {
11528     VALUE obj;
11529
11530     if (argc < 1) {
11531         rb_raise(rb_eArgError, "no receiver given");
11532     }
11533     obj = argv[0];
11534     return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11535 }
11536
11537 #if 0
11538 /*
11539  * call-seq:
11540  *   sym.to_proc
11541  *
11542  * Returns a _Proc_ object which responds to the given method by _sym_.
11543  *
11544  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
11545  */
11546
11547 VALUE
11548 rb_sym_to_proc(VALUE sym)
11549 {
11550 }
11551 #endif
11552
11553 /*
11554  * call-seq:
11555  *
11556  *   sym.succ
11557  *
11558  * Same as <code>sym.to_s.succ.intern</code>.
11559  */
11560
11561 static VALUE
11562 sym_succ(VALUE sym)
11563 {
11564     return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11565 }
11566
11567 /*
11568  * call-seq:
11569  *
11570  *   symbol <=> other_symbol       -> -1, 0, +1, or nil
11571  *
11572  * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11573  * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11574  * less than, equal to, or greater than +other_symbol+.
11575  *
11576  * +nil+ is returned if the two values are incomparable.
11577  *
11578  * See String#<=> for more information.
11579  */
11580
11581 static VALUE
11582 sym_cmp(VALUE sym, VALUE other)
11583 {
11584     if (!SYMBOL_P(other)) {
11585         return Qnil;
11586     }
11587     return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11588 }
11589
11590 /*
11591  *  call-seq:
11592  *    casecmp(other_symbol) -> -1, 0, 1, or nil
11593  *
11594  *  Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
11595  *
11596  *    :aBcDeF.casecmp(:abcde)   # => 1
11597  *    :aBcDeF.casecmp(:abcdef)  # => 0
11598  *    :aBcDeF.casecmp(:abcdefg) # => -1
11599  *    :abcdef.casecmp(:ABCDEF)  # => 0
11600  *
11601  *  Returns +nil+ if the two symbols have incompatible encodings,
11602  *  or if +other_symbol+ is not a symbol:
11603  *
11604  *    sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11605  *    other_sym = :"\u{c4 d6 dc}"
11606  *    sym.casecmp(other_sym) # => nil
11607  *    :foo.casecmp(2)        # => nil
11608  *
11609  *  Currently, case-insensitivity only works on characters A-Z/a-z,
11610  *  not all of Unicode. This is different from Symbol#casecmp?.
11611  *
11612  *  Related: Symbol#casecmp?.
11613  *
11614  */
11615
11616 static VALUE
11617 sym_casecmp(VALUE sym, VALUE other)
11618 {
11619     if (!SYMBOL_P(other)) {
11620         return Qnil;
11621     }
11622     return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11623 }
11624
11625 /*
11626  *  call-seq:
11627  *    casecmp?(other_symbol) -> true, false, or nil
11628  *
11629  *  Returns +true+ if +sym+ and +other_symbol+ are equal after
11630  *  Unicode case folding, +false+ if they are not equal:
11631  *
11632  *    :aBcDeF.casecmp?(:abcde)                  # => false
11633  *    :aBcDeF.casecmp?(:abcdef)                 # => true
11634  *    :aBcDeF.casecmp?(:abcdefg)                # => false
11635  *    :abcdef.casecmp?(:ABCDEF)                 # => true
11636  *    :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11637  *
11638  *  Returns +nil+ if the two symbols have incompatible encodings,
11639  *  or if +other_symbol+ is not a symbol:
11640  *
11641  *    sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11642  *    other_sym = :"\u{c4 d6 dc}"
11643  *    sym.casecmp?(other_sym) # => nil
11644  *    :foo.casecmp?(2)        # => nil
11645  *
11646  *  See {Case Mapping}[doc/case_mapping_rdoc.html].
11647  *
11648  *  Related: Symbol#casecmp.
11649  *
11650  */
11651
11652 static VALUE
11653 sym_casecmp_p(VALUE sym, VALUE other)
11654 {
11655     if (!SYMBOL_P(other)) {
11656         return Qnil;
11657     }
11658     return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11659 }
11660
11661 /*
11662  * call-seq:
11663  *   sym =~ obj   -> integer or nil
11664  *
11665  * Returns <code>sym.to_s =~ obj</code>.
11666  */
11667
11668 static VALUE
11669 sym_match(VALUE sym, VALUE other)
11670 {
11671     return rb_str_match(rb_sym2str(sym), other);
11672 }
11673
11674 /*
11675  * call-seq:
11676  *   sym.match(pattern)        -> matchdata or nil
11677  *   sym.match(pattern, pos)   -> matchdata or nil
11678  *
11679  * Returns <code>sym.to_s.match</code>.
11680  */
11681
11682 static VALUE
11683 sym_match_m(int argc, VALUE *argv, VALUE sym)
11684 {
11685     return rb_str_match_m(argc, argv, rb_sym2str(sym));
11686 }
11687
11688 /*
11689  * call-seq:
11690  *   sym.match?(pattern)        -> true or false
11691  *   sym.match?(pattern, pos)   -> true or false
11692  *
11693  * Returns <code>sym.to_s.match?</code>.
11694  */
11695
11696 static VALUE
11697 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11698 {
11699     return rb_str_match_m_p(argc, argv, sym);
11700 }
11701
11702 /*
11703  * call-seq:
11704  *   sym[idx]      -> char
11705  *   sym[b, n]     -> string
11706  *   sym.slice(idx)      -> char
11707  *   sym.slice(b, n)     -> string
11708  *
11709  * Returns <code>sym.to_s[]</code>.
11710  */
11711
11712 static VALUE
11713 sym_aref(int argc, VALUE *argv, VALUE sym)
11714 {
11715     return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11716 }
11717
11718 /*
11719  * call-seq:
11720  *   sym.length   -> integer
11721  *   sym.size     -> integer
11722  *
11723  * Same as <code>sym.to_s.length</code>.
11724  */
11725
11726 static VALUE
11727 sym_length(VALUE sym)
11728 {
11729     return rb_str_length(rb_sym2str(sym));
11730 }
11731
11732 /*
11733  * call-seq:
11734  *   sym.empty?   -> true or false
11735  *
11736  * Returns whether _sym_ is :"" or not.
11737  */
11738
11739 static VALUE
11740 sym_empty(VALUE sym)
11741 {
11742     return rb_str_empty(rb_sym2str(sym));
11743 }
11744
11745 /*
11746  *  call-seq:
11747  *    upcase(*options) -> symbol
11748  *
11749  *  Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11750  *
11751  *  See String#upcase.
11752  *
11753  */
11754
11755 static VALUE
11756 sym_upcase(int argc, VALUE *argv, VALUE sym)
11757 {
11758     return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11759 }
11760
11761 /*
11762  *  call-seq:
11763  *    downcase(*options) -> symbol
11764  *
11765  *  Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11766  *
11767  *  See String#downcase.
11768  *
11769  *  Related: Symbol#upcase.
11770  *
11771  */
11772
11773 static VALUE
11774 sym_downcase(int argc, VALUE *argv, VALUE sym)
11775 {
11776     return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11777 }
11778
11779 /*
11780  *  call-seq:
11781  *    capitalize(*options) -> symbol
11782  *
11783  *  Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11784  *
11785  *  See String#capitalize.
11786  *
11787  */
11788
11789 static VALUE
11790 sym_capitalize(int argc, VALUE *argv, VALUE sym)
11791 {
11792     return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11793 }
11794
11795 /*
11796  *  call-seq:
11797  *    swapcase(*options) -> symbol
11798  *
11799  *  Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11800  *
11801  *  See String#swapcase.
11802  *
11803  */
11804
11805 static VALUE
11806 sym_swapcase(int argc, VALUE *argv, VALUE sym)
11807 {
11808     return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11809 }
11810
11811 /*
11812  *  call-seq:
11813  *     sym.start_with?([prefixes]+)   -> true or false
11814  *
11815  *  Returns true if +sym+ starts with one of the +prefixes+ given.
11816  *  Each of the +prefixes+ should be a String or a Regexp.
11817  *
11818  *    :hello.start_with?("hell")               #=> true
11819  *    :hello.start_with?(/H/i)                 #=> true
11820  *
11821  *    # returns true if one of the prefixes matches.
11822  *    :hello.start_with?("heaven", "hell")     #=> true
11823  *    :hello.start_with?("heaven", "paradise") #=> false
11824  */
11825
11826 static VALUE
11827 sym_start_with(int argc, VALUE *argv, VALUE sym)
11828 {
11829     return rb_str_start_with(argc, argv, rb_sym2str(sym));
11830 }
11831
11832 /*
11833  *  call-seq:
11834  *     sym.end_with?([suffixes]+)   -> true or false
11835  *
11836  *  Returns true if +sym+ ends with one of the +suffixes+ given.
11837  *
11838  *    :hello.end_with?("ello")               #=> true
11839  *
11840  *    # returns true if one of the +suffixes+ matches.
11841  *    :hello.end_with?("heaven", "ello")     #=> true
11842  *    :hello.end_with?("heaven", "paradise") #=> false
11843  */
11844
11845 static VALUE
11846 sym_end_with(int argc, VALUE *argv, VALUE sym)
11847 {
11848     return rb_str_end_with(argc, argv, rb_sym2str(sym));
11849 }
11850
11851 /*
11852  * call-seq:
11853  *   sym.encoding   -> encoding
11854  *
11855  * Returns the Encoding object that represents the encoding of _sym_.
11856  */
11857
11858 static VALUE
11859 sym_encoding(VALUE sym)
11860 {
11861     return rb_obj_encoding(rb_sym2str(sym));
11862 }
11863
11864 static VALUE
11865 string_for_symbol(VALUE name)
11866 {
11867     if (!RB_TYPE_P(name, T_STRING)) {
11868         VALUE tmp = rb_check_string_type(name);
11869         if (NIL_P(tmp)) {
11870             rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11871                      name);
11872         }
11873         name = tmp;
11874     }
11875     return name;
11876 }
11877
11878 ID
11879 rb_to_id(VALUE name)
11880 {
11881     if (SYMBOL_P(name)) {
11882         return SYM2ID(name);
11883     }
11884     name = string_for_symbol(name);
11885     return rb_intern_str(name);
11886 }
11887
11888 VALUE
11889 rb_to_symbol(VALUE name)
11890 {
11891     if (SYMBOL_P(name)) {
11892         return name;
11893     }
11894     name = string_for_symbol(name);
11895     return rb_str_intern(name);
11896 }
11897
11898 /*
11899  *  call-seq:
11900  *     Symbol.all_symbols    -> array
11901  *
11902  *  Returns an array of all the symbols currently in Ruby's symbol
11903  *  table.
11904  *
11905  *     Symbol.all_symbols.size    #=> 903
11906  *     Symbol.all_symbols[1,20]   #=> [:floor, :ARGV, :Binding, :symlink,
11907  *                                     :chown, :EOFError, :$;, :String,
11908  *                                     :LOCK_SH, :"setuid?", :$<,
11909  *                                     :default_proc, :compact, :extend,
11910  *                                     :Tms, :getwd, :$=, :ThreadGroup,
11911  *                                     :wait2, :$>]
11912  */
11913
11914 static VALUE
11915 sym_all_symbols(VALUE _)
11916 {
11917     return rb_sym_all_symbols();
11918 }
11919
11920 VALUE
11921 rb_str_to_interned_str(VALUE str)
11922 {
11923     return rb_fstring(str);
11924 }
11925
11926 VALUE
11927 rb_interned_str(const char *ptr, long len)
11928 {
11929     struct RString fake_str;
11930     return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11931 }
11932
11933 VALUE
11934 rb_interned_str_cstr(const char *ptr)
11935 {
11936     return rb_interned_str(ptr, strlen(ptr));
11937 }
11938
11939 VALUE
11940 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11941 {
11942     if (UNLIKELY(rb_enc_autoload_p(enc))) {
11943         rb_enc_autoload(enc);
11944     }
11945
11946     struct RString fake_str;
11947     return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11948 }
11949
11950 VALUE
11951 rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
11952 {
11953     return rb_enc_interned_str(ptr, strlen(ptr), enc);
11954 }
11955
11956 /*
11957  *  A \String object has an arbitrary sequence of bytes,
11958  *  typically representing text or binary data.
11959  *  A \String object may be created using String::new or as literals.
11960  *
11961  *  String objects differ from Symbol objects in that Symbol objects are
11962  *  designed to be used as identifiers, instead of text or data.
11963  *
11964  *  You can create a \String object explicitly with:
11965  *
11966  *  - A {string literal}[doc/syntax/literals_rdoc.html#label-String+Literals].
11967  *  - A {heredoc literal}[doc/syntax/literals_rdoc.html#label-Here+Document+Literals].
11968  *
11969  *  You can convert certain objects to Strings with:
11970  *
11971  *  - \Method {String}[Kernel.html#method-i-String].
11972  *
11973  *  Some \String methods modify +self+.
11974  *  Typically, a method whose name ends with <tt>!</tt> modifies +self+
11975  *  and returns +self+;
11976  *  often a similarly named method (without the <tt>!</tt>)
11977  *  returns a new string.
11978  *
11979  *  In general, if both bang and non-bang versions of a method exist,
11980  *  the bang version mutates +self+ and the non-bang version does not.
11981  *  However, a method without a bang can also mutate, such as String#replace.
11982  *
11983  *  == Substitution Methods
11984  *
11985  *  These methods perform substitutions:
11986  *
11987  *  - String#sub: One substitution (or none); returns a new string.
11988  *  - String#sub!: One substitution (or none); returns +self+.
11989  *  - String#gsub: Zero or more substitutions; returns a new string.
11990  *  - String#gsub!: Zero or more substitutions; returns +self+.
11991  *
11992  *  Each of these methods takes:
11993  *
11994  *  - A first argument, +pattern+ (string or regexp),
11995  *    that specifies the substring(s) to be replaced.
11996  *
11997  *  - Either of these:
11998  *
11999  *    - A second argument, +replacement+ (string or hash),
12000  *      that determines the replacing string.
12001  *    - A block that will determine the replacing string.
12002  *
12003  *  The examples in this section mostly use methods String#sub and String#gsub;
12004  *  the principles illustrated apply to all four substitution methods.
12005  *
12006  *  <b>Argument +pattern+</b>
12007  *
12008  *  Argument +pattern+ is commonly a regular expression:
12009  *
12010  *    s = 'hello'
12011  *    s.sub(/[aeiou]/, '*')  # => "h*llo"
12012  *    s.gsub(/[aeiou]/, '*') # => "h*ll*"
12013  *    s.gsub(/[aeiou]/, '')  # => "hll"
12014  *    s.sub(/ell/, 'al')     # => "halo"
12015  *    s.gsub(/xyzzy/, '*')   # => "hello"
12016  *    'THX1138'.gsub(/\d+/, '00') # => "THX00"
12017  *
12018  *  When +pattern+ is a string, all its characters are treated
12019  *  as ordinary characters (not as regexp special characters):
12020  *
12021  *    'THX1138'.gsub('\d+', '00') # => "THX1138"
12022  *
12023  *  <b>\String +replacement+</b>
12024  *
12025  *  If +replacement+ is a string, that string will determine
12026  *  the replacing string that is to be substituted for the matched text.
12027  *
12028  *  Each of the examples above uses a simple string as the replacing string.
12029  *
12030  *  \String +replacement+ may contain back-references to the pattern's captures:
12031  *
12032  *  - <tt>\n</tt> (_n_ a non-negative integer) refers to <tt>$n</tt>.
12033  *  - <tt>\k<name></tt> refers to the named capture +name+.
12034  *
12035  *  See rdoc-ref:regexp.rdoc for details.
12036  *
12037  *  Note that within the string +replacement+, a character combination
12038  *  such as <tt>$&</tt> is treated as ordinary text, and not as
12039  *  a special match variable.
12040  *  However, you may refer to some special match variables using these
12041  *  combinations:
12042  *
12043  *  - <tt>\&</tt> and <tt>\0</tt> correspond to <tt>$&</tt>,
12044  *    which contains the complete matched text.
12045  *  - <tt>\'</tt> corresponds to <tt>$'</tt>,
12046  *    which contains string after match.
12047  *  - <tt>\`</tt> corresponds to <tt>$`</tt>,
12048  *    which contains string before match.
12049  *  - <tt>\+</tt> corresponds to <tt>$+</tt>,
12050  *    which contains last capture group.
12051  *
12052  *  See rdoc-ref:regexp.rdoc for details.
12053  *
12054  *  Note that <tt>\\\\</tt> is interpreted as an escape, i.e., a single backslash.
12055  *
12056  *  Note also that a string literal consumes backslashes.
12057  *  See {String Literals}[doc/syntax/literals_rdoc.html#label-String+Literals] for details about string literals.
12058  *
12059  *  A back-reference is typically preceded by an additional backslash.
12060  *  For example, if you want to write a back-reference <tt>\&</tt> in
12061  *  +replacement+ with a double-quoted string literal, you need to write
12062  *  <tt>"..\\\\&.."</tt>.
12063  *
12064  *  If you want to write a non-back-reference string <tt>\&</tt> in
12065  *  +replacement+, you need first to escape the backslash to prevent
12066  *  this method from interpreting it as a back-reference, and then you
12067  *  need to escape the backslashes again to prevent a string literal from
12068  *  consuming them: <tt>"..\\\\\\\\&.."</tt>.
12069  *
12070  *  You may want to use the block form to avoid a lot of backslashes.
12071  *
12072  *  <b>\Hash +replacement+</b>
12073  *
12074  *  If argument +replacement+ is a hash, and +pattern+ matches one of its keys,
12075  *  the replacing string is the value for that key:
12076  *
12077  *    h = {'foo' => 'bar', 'baz' => 'bat'}
12078  *    'food'.sub('foo', h) # => "bard"
12079  *
12080  *  Note that a symbol key does not match:
12081  *
12082  *    h = {foo: 'bar', baz: 'bat'}
12083  *    'food'.sub('foo', h) # => "d"
12084  *
12085  *  <b>Block</b>
12086  *
12087  *  In the block form, the current match string is passed to the block;
12088  *  the block's return value becomes the replacing string:
12089  *
12090  *    s = '@'
12091  *    '1234'.gsub(/\d/) {|match| s.succ! } # => "ABCD"
12092  *
12093  *  Special match variables such as <tt>$1</tt>, <tt>$2</tt>, <tt>$`</tt>,
12094  *  <tt>$&</tt>, and <tt>$'</tt> are set appropriately.
12095  *
12096  *
12097  *  == What's Here
12098  *
12099  *  First, what's elsewhere. \Class \String:
12100  *
12101  *  - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
12102  *  - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
12103  *
12104  *  Here, class \String provides methods that are useful for:
12105  *
12106  *  - {Creating a String}[#class-String-label-Methods+for+Creating+a+String]
12107  *  - {Frozen/Unfrozen Strings}[#class-String-label-Methods+for+a+Frozen-2FUnfrozen+String]
12108  *  - {Querying}[#class-String-label-Methods+for+Querying]
12109  *  - {Comparing}[#class-String-label-Methods+for+Comparing]
12110  *  - {Modifying a String}[#class-String-label-Methods+for+Modifying+a+String]
12111  *  - {Converting to New String}[#class-String-label-Methods+for+Converting+to+New+String]
12112  *  - {Converting to Non-String}[#class-String-label-Methods+for+Converting+to+Non--5CString]
12113  *  - {Iterating}[#class-String-label-Methods+for+Iterating]
12114  *
12115  *  === Methods for Creating a \String
12116  *
12117  *  - ::new:: Returns a new string.
12118  *  - ::try_convert:: Returns a new string created from a given object.
12119  *
12120  *  === Methods for a Frozen/Unfrozen String
12121  *
12122  *  - {#+string}[#method-i-2B-40]:: Returns a string that is not frozen:
12123  *                                  +self+, if not frozen; +self.dup+ otherwise.
12124  *  - {#-string}[#method-i-2D-40]:: Returns a string that is frozen:
12125  *                                  +self+, if already frozen; +self.freeze+ otherwise.
12126  *  - #freeze:: Freezes +self+, if not already frozen; returns +self+.
12127  *
12128  *  === Methods for Querying
12129  *
12130  *  _Counts_
12131  *
12132  *  - #length, #size:: Returns the count of characters (not bytes).
12133  *  - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12134  *  - #bytesize:: Returns the count of bytes.
12135  *  - #count:: Returns the count of substrings matching given strings.
12136  *
12137  *  _Substrings_
12138  *
12139  *  - {#=~}[#method-i-3D~]:: Returns the index of the first substring that matches a given Regexp or other object;
12140  *                           returns +nil+ if no match is found.
12141  *  - #index:: Returns the index of the _first_ occurrence of a given substring;
12142  *             returns +nil+ if none found.
12143  *  - #rindex:: Returns the index of the _last_ occurrence of a given substring;
12144  *              returns +nil+ if none found.
12145  *  - #include?:: Returns +true+ if the string contains a given substring; +false+ otherwise.
12146  *  - #match:: Returns a MatchData object if the string matches a given Regexp; +nil+ otherwise.
12147  *  - #match?:: Returns +true+ if the string matches a given Regexp; +false+ otherwise.
12148  *  - #start_with?:: Returns +true+ if the string begins with any of the given substrings.
12149  *  - #end_with?:: Returns +true+ if the string ends with any of the given substrings.
12150  *
12151  *  _Encodings_
12152  *
12153  *  - #encoding:: Returns the Encoding object that represents the encoding of the string.
12154  *  - #unicode_normalized?:: Returns +true+ if the string is in Unicode normalized form; +false+ otherwise.
12155  *  - #valid_encoding?:: Returns +true+ if the string contains only characters that are valid
12156  *                       for its encoding.
12157  *  - #ascii_only?:: Returns +true+ if the string has only ASCII characters; +false+ otherwise.
12158  *
12159  *  _Other_
12160  *
12161  *  - #sum:: Returns a basic checksum for the string: the sum of each byte.
12162  *  - #hash:: Returns the integer hash code.
12163  *
12164  *  === Methods for Comparing
12165  *
12166  *  - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given other string has the same content as +self+.
12167  *  - #eql?:: Returns +true+ if the content is the same as the given other string.
12168  *  - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as +self+ is smaller than, equal to, or larger than a given other string.
12169  *  - #casecmp:: Ignoring case, returns -1, 0, or 1 as +self+
12170  *               is smaller than, equal to, or larger than a given other string.
12171  *  - #casecmp?:: Returns +true+ if the string is equal to a given string after Unicode case folding;
12172  *                +false+ otherwise.
12173  *
12174  *  === Methods for Modifying a \String
12175  *
12176  *  Each of these methods modifies +self+.
12177  *
12178  *  _Insertion_
12179  *
12180  *  - #insert:: Returns +self+ with a given string inserted at a given offset.
12181  *  - #<<:: Returns +self+ concatenated with a given string or integer.
12182  *
12183  *  _Substitution_
12184  *
12185  *  - #sub!:: Replaces the first substring that matches a given pattern with a given replacement string;
12186  *            returns +self+ if any changes, +nil+ otherwise.
12187  *  - #gsub!:: Replaces each substring that matches a given pattern with a given replacement string;
12188  *             returns +self+ if any changes, +nil+ otherwise.
12189  *  - #succ!, #next!:: Returns +self+ modified to become its own successor.
12190  *  - #replace:: Returns +self+ with its entire content replaced by a given string.
12191  *  - #reverse!:: Returns +self+ with its characters in reverse order.
12192  *  - #setbyte:: Sets the byte at a given integer offset to a given value; returns the argument.
12193  *  - #tr!:: Replaces specified characters in +self+ with specified replacement characters;
12194  *           returns +self+ if any changes, +nil+ otherwise.
12195  *  - #tr_s!:: Replaces specified characters in +self+ with specified replacement characters,
12196  *             removing duplicates from the substrings that were modified;
12197  *             returns +self+ if any changes, +nil+ otherwise.
12198  *
12199  *  _Casing_
12200  *
12201  *  - #capitalize!:: Upcases the initial character and downcases all others;
12202  *                   returns +self+ if any changes, +nil+ otherwise.
12203  *  - #downcase!:: Downcases all characters; returns +self+ if any changes, +nil+ otherwise.
12204  *  - #upcase!:: Upcases all characters; returns +self+ if any changes, +nil+ otherwise.
12205  *  - #swapcase!:: Upcases each downcase character and downcases each upcase character;
12206  *                 returns +self+ if any changes, +nil+ otherwise.
12207  *
12208  *  _Encoding_
12209  *
12210  *  - #encode!:: Returns +self+ with all characters transcoded from one given encoding into another.
12211  *  - #unicode_normalize!:: Unicode-normalizes +self+; returns +self+.
12212  *  - #scrub!:: Replaces each invalid byte with a given character; returns +self+.
12213  *  - #force_encoding:: Changes the encoding to a given encoding; returns +self+.
12214  *
12215  *  _Deletion_
12216  *
12217  *  - #clear:: Removes all content, so that +self+ is empty; returns +self+.
12218  *  - #slice!, #[]=:: Removes a substring determined by a given index, start/length, range, regexp, or substring.
12219  *  - #squeeze!:: Removes contiguous duplicate characters; returns +self+.
12220  *  - #delete!:: Removes characters as determined by the intersection of substring arguments.
12221  *  - #lstrip!:: Removes leading whitespace; returns +self+ if any changes, +nil+ otherwise.
12222  *  - #rstrip!:: Removes trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12223  *  - #strip!:: Removes leading and trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12224  *  - #chomp!:: Removes trailing record separator, if found; returns +self+ if any changes, +nil+ otherwise.
12225  *  - #chop!:: Removes a trailing carriage-return/newline pair if found, otherwise removes the last character;
12226  *             returns +self+ if any changes, +nil+ otherwise.
12227  *
12228  *  === Methods for Converting to New \String
12229  *
12230  *  Each of these methods returns a new \String based on +self+,
12231  *  often just a modified copy of +self+.
12232  *
12233  *  _Extension_
12234  *
12235  *  - #*:: Returns the concatenation of multiple copies of +self+.
12236  *  - #+:: Returns the concatenation of +self+ and a given other string.
12237  *  - #center:: Returns a copy of +self+ centered between pad substrings.
12238  *  - #concat:: Returns the concatenation of +self+ with given other strings.
12239  *  - #prepend:: Returns the concatenation of a given other string with +self+.
12240  *  - #ljust:: Returns a copy of +self+ of a given length, right-padded with a given other string.
12241  *  - #rjust:: Returns a copy of +self+ of a given length, left-padded with a given other string.
12242  *
12243  *  _Encoding_
12244  *
12245  *  - #b:: Returns a copy of +self+ with ASCII-8BIT encoding.
12246  *  - #scrub:: Returns a copy of +self+ with each invalid byte replaced with a given character.
12247  *  - #unicode_normalize:: Returns a copy of +self+ with each character Unicode-normalized.
12248  *  - #encode:: Returns a copy of +self+ with all characters transcoded from one given encoding into another.
12249  *
12250  *  _Substitution_
12251  *
12252  *  - #dump:: Returns a copy of +self+ with all non-printing characters replaced by <tt>\xHH</tt> notation
12253  *            and all special characters escaped.
12254  *  - #undump:: Returns a copy of +self+ with all <tt>\xNN</tt> notation replaced by <tt>\uNNNN</tt> notation
12255  *              and all escaped characters unescaped.
12256  *  - #sub:: Returns a copy of +self+ with the first substring matching a given pattern
12257  *           replaced with a given replacement string.
12258  *  - #gsub:: Returns a copy of +self+ with each substring that matches a given pattern
12259  *            replaced with a given replacement string.
12260  *  - #succ, #next:: Returns the string that is the successor to +self+.
12261  *  - #reverse:: Returns a copy of +self+ with its characters in reverse order.
12262  *  - #tr:: Returns a copy of +self+ with specified characters replaced with specified replacement characters.
12263  *  - #tr_s:: Returns a copy of +self+ with specified characters replaced with specified replacement characters,
12264  *            removing duplicates from the substrings that were modified.
12265  *  - #%:: Returns the string resulting from formatting a given object into +self+.
12266  *
12267  *  _Casing_
12268  *
12269  *  - #capitalize:: Returns a copy of +self+ with the first character upcased
12270  *                  and all other characters downcased.
12271  *  - #downcase:: Returns a copy of +self+ with all characters downcased.
12272  *  - #upcase:: Returns a copy of +self+ with all characters upcased.
12273  *  - #swapcase:: Returns a copy of +self+ with all upcase characters downcased
12274  *                and all downcase characters upcased.
12275  *
12276  *  _Deletion_
12277  *
12278  *  - #delete:: Returns a copy of +self+ with characters removed.
12279  *  - #delete_prefix:: Returns a copy of +self+ with a given prefix removed.
12280  *  - #delete_suffix:: Returns a copy of +self+ with a given suffix removed.
12281  *  - #lstrip:: Returns a copy of +self+ with leading whitespace removed.
12282  *  - #rstrip:: Returns a copy of +self+ with trailing whitespace removed.
12283  *  - #strip:: Returns a copy of +self+ with leading and trailing whitespace removed.
12284  *  - #chomp:: Returns a copy of +self+ with a trailing record separator removed, if found.
12285  *  - #chop:: Returns a copy of +self+ with a trailing carriage-return/newline pair or the last character removed.
12286  *  - #squeeze:: Returns a copy of +self+ with contiguous duplicate characters removed.
12287  *  - #[], #slice:: Returns a substring determined by a given index, start/length, range, regexp, or string.
12288  *  - #byteslice:: Returns a substring determined by a given index, start/length, or range.
12289  *  - #chr:: Returns the first character.
12290  *
12291  *  _Duplication_
12292  *
12293  *  - #to_s, #to_str:: If +self+ is a subclass of \String, returns +self+ copied into a \String;
12294  *                     otherwise, returns +self+.
12295  *
12296  *  === Methods for Converting to Non-\String
12297  *
12298  *  Each of these methods converts the contents of +self+ to a non-\String.
12299  *
12300  *  <em>Characters, Bytes, and Clusters</em>
12301  *
12302  *  - #bytes:: Returns an array of the bytes in +self+.
12303  *  - #chars:: Returns an array of the characters in +self+.
12304  *  - #codepoints:: Returns an array of the integer ordinals in +self+.
12305  *  - #getbyte:: Returns an integer byte as determined by a given index.
12306  *  - #grapheme_clusters:: Returns an array of the grapheme clusters in +self+.
12307  *
12308  *  _Splitting_
12309  *
12310  *  - #lines:: Returns an array of the lines in +self+, as determined by a given record separator.
12311  *  - #partition:: Returns a 3-element array determined by the first substring that matches
12312  *                 a given substring or regexp.
12313  *  - #rpartition:: Returns a 3-element array determined by the last substring that matches
12314  *                  a given substring or regexp.
12315  *  - #split:: Returns an array of substrings determined by a given delimiter -- regexp or string --
12316  *             or, if a block given, passes those substrings to the block.
12317  *
12318  *  _Matching_
12319  *
12320  *  - #scan:: Returns an array of substrings matching a given regexp or string, or,
12321  *            if a block given, passes each matching substring to the block.
12322  *  - #unpack:: Returns an array of substrings extracted from +self+ according to a given format.
12323  *  - #unpack1:: Returns the first substring extracted from +self+ according to a given format.
12324  *
12325  *  _Numerics_
12326  *
12327  *  - #hex:: Returns the integer value of the leading characters, interpreted as hexadecimal digits.
12328  *  - #oct:: Returns the integer value of the leading characters, interpreted as octal digits.
12329  *  - #ord:: Returns the integer ordinal of the first character in +self+.
12330  *  - #to_i:: Returns the integer value of leading characters, interpreted as an integer.
12331  *  - #to_f:: Returns the floating-point value of leading characters, interpreted as a floating-point number.
12332  *
12333  *  <em>Strings and Symbols</em>
12334  *
12335  *  - #inspect:: Returns a copy of +self+, enclosed in double-quotes, with special characters escaped.
12336  *  - #to_sym, #intern:: Returns the symbol corresponding to +self+.
12337  *
12338  *  === Methods for Iterating
12339  *
12340  *  - #each_byte:: Calls the given block with each successive byte in +self+.
12341  *  - #each_char:: Calls the given block with each successive character in +self+.
12342  *  - #each_codepoint:: Calls the given block with each successive integer codepoint in +self+.
12343  *  - #each_grapheme_cluster:: Calls the given block with each successive grapheme cluster in +self+.
12344  *  - #each_line:: Calls the given block with each successive line in +self+,
12345  *                 as determined by a given record separator.
12346  *  - #upto:: Calls the given block with each string value returned by successive calls to #succ.
12347  */
12348
12349 void
12350 Init_String(void)
12351 {
12352     rb_cString  = rb_define_class("String", rb_cObject);
12353     assert(rb_vm_fstring_table());
12354     st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12355     rb_include_module(rb_cString, rb_mComparable);
12356     rb_define_alloc_func(rb_cString, empty_str_alloc);
12357     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12358     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12359     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12360     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12361     rb_define_method(rb_cString, "==", rb_str_equal, 1);
12362     rb_define_method(rb_cString, "===", rb_str_equal, 1);
12363     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12364     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12365     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12366     rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12367     rb_define_method(rb_cString, "+", rb_str_plus, 1);
12368     rb_define_method(rb_cString, "*", rb_str_times, 1);
12369     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12370     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12371     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12372     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12373     rb_define_method(rb_cString, "length", rb_str_length, 0);
12374     rb_define_method(rb_cString, "size", rb_str_length, 0);
12375     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12376     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12377     rb_define_method(rb_cString, "=~", rb_str_match, 1);
12378     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12379     rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12380     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
12381     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12382     rb_define_method(rb_cString, "next", rb_str_succ, 0);
12383     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12384     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12385     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12386     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12387     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12388     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12389     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12390     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12391     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12392     rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12393     rb_define_method(rb_cString, "scrub", str_scrub, -1);
12394     rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12395     rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12396     rb_define_method(rb_cString, "+@", str_uplus, 0);
12397     rb_define_method(rb_cString, "-@", str_uminus, 0);
12398
12399     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12400     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12401     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12402     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12403     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12404     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
12405     rb_define_method(rb_cString, "undump", str_undump, 0);
12406
12407     sym_ascii      = ID2SYM(rb_intern_const("ascii"));
12408     sym_turkic     = ID2SYM(rb_intern_const("turkic"));
12409     sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12410     sym_fold       = ID2SYM(rb_intern_const("fold"));
12411
12412     rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12413     rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12414     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12415     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12416
12417     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12418     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12419     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12420     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12421
12422     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12423     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12424     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12425     rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12426     rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12427     rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12428     rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12429     rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12430     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12431     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12432     rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12433     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
12434     rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12435     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12436     rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12437     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12438     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12439
12440     rb_define_method(rb_cString, "include?", rb_str_include, 1);
12441     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12442     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12443
12444     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12445
12446     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12447     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12448     rb_define_method(rb_cString, "center", rb_str_center, -1);
12449
12450     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12451     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12452     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12453     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12454     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12455     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12456     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12457     rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12458     rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12459
12460     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12461     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12462     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12463     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12464     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12465     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12466     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12467     rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12468     rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12469
12470     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12471     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12472     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12473     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12474     rb_define_method(rb_cString, "count", rb_str_count, -1);
12475
12476     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12477     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12478     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12479     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12480
12481     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12482     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12483     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12484     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12485     rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12486
12487     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12488
12489     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12490     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12491
12492     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12493     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12494
12495     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12496     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12497     rb_define_method(rb_cString, "b", rb_str_b, 0);
12498     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12499     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12500
12501     /* define UnicodeNormalize module here so that we don't have to look it up */
12502     mUnicodeNormalize          = rb_define_module("UnicodeNormalize");
12503     id_normalize               = rb_intern_const("normalize");
12504     id_normalized_p            = rb_intern_const("normalized?");
12505
12506     rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12507     rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12508     rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12509
12510     rb_fs = Qnil;
12511     rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12512     rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12513     rb_gc_register_address(&rb_fs);
12514
12515     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12516     rb_include_module(rb_cSymbol, rb_mComparable);
12517     rb_undef_alloc_func(rb_cSymbol);
12518     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
12519     rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12520
12521     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12522     rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12523     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12524     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
12525     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12526     rb_define_method(rb_cSymbol, "name", rb_sym2str, 0);
12527     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
12528     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
12529     rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
12530     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12531     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12532
12533     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12534     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12535     rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12536     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12537
12538     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12539     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12540     rb_define_method(rb_cSymbol, "length", sym_length, 0);
12541     rb_define_method(rb_cSymbol, "size", sym_length, 0);
12542     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12543     rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12544     rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12545
12546     rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12547     rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12548     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12549     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12550
12551     rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12552     rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12553
12554     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12555 }