* 2022-01-18 [ci skip]
[ruby-80x24.org.git] / string.c
blob777f9fadc4d416915edc24b7658f385181d1a900
1 /**********************************************************************
3 string.c -
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
12 **********************************************************************/
14 #include "ruby/internal/config.h"
16 #include <ctype.h>
17 #include <errno.h>
18 #include <math.h>
20 #ifdef HAVE_UNISTD_H
21 # include <unistd.h>
22 #endif
24 #include "debug_counter.h"
25 #include "encindex.h"
26 #include "gc.h"
27 #include "id.h"
28 #include "internal.h"
29 #include "internal/array.h"
30 #include "internal/compar.h"
31 #include "internal/compilers.h"
32 #include "internal/encoding.h"
33 #include "internal/error.h"
34 #include "internal/gc.h"
35 #include "internal/numeric.h"
36 #include "internal/object.h"
37 #include "internal/proc.h"
38 #include "internal/re.h"
39 #include "internal/sanitizers.h"
40 #include "internal/string.h"
41 #include "internal/transcode.h"
42 #include "probes.h"
43 #include "ruby/encoding.h"
44 #include "ruby/re.h"
45 #include "ruby/util.h"
46 #include "ruby_assert.h"
47 #include "vm_sync.h"
49 #if defined HAVE_CRYPT_R
50 # if defined HAVE_CRYPT_H
51 # include <crypt.h>
52 # endif
53 #elif !defined HAVE_CRYPT
54 # include "missing/crypt.h"
55 # define HAVE_CRYPT_R 1
56 #endif
/* Start and end offsets of capture group `idx`, read through a variable
 * named `regs` that must be visible wherever these macros are expanded. */
#define BEG(idx) (regs->beg[(idx)])
#define END(idx) (regs->end[(idx)])
61 #undef rb_str_new
62 #undef rb_usascii_str_new
63 #undef rb_utf8_str_new
64 #undef rb_enc_str_new
65 #undef rb_str_new_cstr
66 #undef rb_usascii_str_new_cstr
67 #undef rb_utf8_str_new_cstr
68 #undef rb_enc_str_new_cstr
69 #undef rb_external_str_new_cstr
70 #undef rb_locale_str_new_cstr
71 #undef rb_str_dup_frozen
72 #undef rb_str_buf_new_cstr
73 #undef rb_str_buf_cat
74 #undef rb_str_buf_cat2
75 #undef rb_str_cat2
76 #undef rb_str_cat_cstr
77 #undef rb_fstring_cstr
79 VALUE rb_cString;
80 VALUE rb_cSymbol;
/* FLAGS of RString
 *
 * 1:     RSTRING_NOEMBED
 * 2:     STR_SHARED (== ELTS_SHARED)
 * 2-6:   RSTRING_EMBED_LEN (5 bits == 32)
 * 5:     STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
 *                         other strings that rely on this string's buffer)
 * 6:     STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
 *                      early, specific to rb_str_tmp_frozen_{acquire,release})
 * 7:     STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
 *                     such as read(2). Any modification and realloc is prohibited)
 *
 * 8-9:   ENC_CODERANGE (2 bits)
 * 10-16: ENCODING (7 bits == 128)
 * 17:    RSTRING_FSTR
 * 18:    STR_NOFREE (do not free this string's buffer when a String is freed.
 *                    used for a string object based on C string literal)
 * 19:    STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
 *                     object header is temporarily allocated on C stack)
 */
#define RUBY_MAX_CHAR_LEN 16

/* String-private flag bits, mapped onto the per-object user flags. */
#define STR_SHARED_ROOT FL_USER5   /* other strings may rely on this buffer */
#define STR_BORROWED    FL_USER6   /* set by STR_SET_SHARED on a klass==0 root */
#define STR_TMPLOCK     FL_USER7   /* buffer handed to a syscall; no modification */
#define STR_NOFREE      FL_USER18  /* buffer must not be freed with the String */
#define STR_FAKESTR     FL_USER19  /* object header is not managed by the GC */
/* Flag `str` as heap-backed.  With USE_RVARGC the sharing-related flags are
 * cleared as well; otherwise the embedded-length field is reset to zero. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    if (!USE_RVARGC) {\
        STR_SET_EMBED_LEN((str), 0);\
    }\
    else {\
        FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
    }\
} while (0)

/* Flag `str` as embedded by clearing both STR_NOEMBED and STR_NOFREE. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/* Store `n` as the length of an embedded string.  Without USE_RVARGC the
 * length lives in the flag word; with it, in the embed struct itself (and
 * is asserted to be smaller than the embed capacity). */
#if !USE_RVARGC
# define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
#else
# define STR_SET_EMBED_LEN(str, n) do { \
    assert(str_embed_capa(str) > (n));\
    RSTRING(str)->as.embed.len = (n);\
} while (0)
#endif
/* Set the length of `str` to `n`, in whichever representation it uses. */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len = (n);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
} while (0)

/* Shrink the length of `str` by one, in whichever representation it uses. */
#define STR_DEC_LEN(str) do {\
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len--;\
    }\
    else {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN((str), n);\
    }\
} while (0)
/* Terminator width for `str`: the minimum character length of its encoding. */
#define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))

/* Write a terminator of `termlen` zero bytes at `ptr`.  The first byte is
 * always stored directly; memset is only used for wider terminators. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1)) {\
        memset(term_fill_ptr, 0, term_fill_len);\
    }\
} while (0)
/* Resize the buffer of `str` to `capacity` bytes plus the terminator width
 * implied by its encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/* Resize the buffer of `str` to `capacity` bytes plus `termlen` terminator
 * bytes.
 *
 * - Embedded string: if the embed area is too small, a heap buffer is
 *   allocated, the current contents are copied into it, and the string is
 *   switched to the heap representation.  Otherwise nothing changes.
 * - Heap string: the buffer is reallocated in place; the string must not be
 *   shared (asserted).
 *
 * Every use of `capacity` and `termlen` is parenthesized so that argument
 * expressions with low-precedence operators are evaluated as a unit. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
/* Make `str` share the buffer owned by `shared_str`.  Does nothing for a
 * fake string.  Otherwise it asserts that str's pointer lies inside the
 * root's buffer, records the root with a write barrier, and flags both
 * objects; a root whose class is 0 is additionally marked STR_BORROWED. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) { /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
        } \
    } \
} while (0)
/* Raw heap pointer and allocated byte size (capacity + terminator). */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

#define STR_ENC_GET(str) get_encoding(str)

/* SHARABLE_SUBSTRING_P(beg, len, end): by default only a substring that
 * runs up to `end` qualifies; with SHARABLE_MIDDLE_SUBSTRING every
 * substring does. */
#if !defined SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
214 static inline long
215 str_embed_capa(VALUE str)
217 #if USE_RVARGC
218 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
219 #else
220 return RSTRING_EMBED_LEN_MAX + 1;
221 #endif
224 static inline size_t
225 str_embed_size(long capa)
227 return offsetof(struct RString, as.embed.ary) + capa;
230 static inline bool
231 STR_EMBEDDABLE_P(long len, long termlen)
233 #if USE_RVARGC
234 return rb_gc_size_allocatable_p(str_embed_size(len + termlen));
235 #else
236 return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
237 #endif
240 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
241 static VALUE str_new_frozen(VALUE klass, VALUE orig);
242 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
243 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
244 static VALUE str_new(VALUE klass, const char *ptr, long len);
245 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
246 static inline void str_modifiable(VALUE str);
247 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
249 static inline void
250 str_make_independent(VALUE str)
252 long len = RSTRING_LEN(str);
253 int termlen = TERM_LEN(str);
254 str_make_independent_expand((str), len, 0L, termlen);
257 static inline int str_dependent_p(VALUE str);
259 void
260 rb_str_make_independent(VALUE str)
262 if (str_dependent_p(str)) {
263 str_make_independent(str);
/* Debugging aid: print a warning to stderr naming the function (`func`)
 * that is about to return a NULL string pointer.  The message ends with a
 * newline so that whatever is written to stderr next starts on its own
 * line. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr, "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately. "
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
277 /* symbols for [up|down|swap]case/capitalize options */
278 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
280 static rb_encoding *
281 get_actual_encoding(const int encidx, VALUE str)
283 const unsigned char *q;
285 switch (encidx) {
286 case ENCINDEX_UTF_16:
287 if (RSTRING_LEN(str) < 2) break;
288 q = (const unsigned char *)RSTRING_PTR(str);
289 if (q[0] == 0xFE && q[1] == 0xFF) {
290 return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
292 if (q[0] == 0xFF && q[1] == 0xFE) {
293 return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
295 return rb_ascii8bit_encoding();
296 case ENCINDEX_UTF_32:
297 if (RSTRING_LEN(str) < 4) break;
298 q = (const unsigned char *)RSTRING_PTR(str);
299 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
300 return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
302 if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
303 return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
305 return rb_ascii8bit_encoding();
307 return rb_enc_from_index(encidx);
310 static rb_encoding *
311 get_encoding(VALUE str)
313 return get_actual_encoding(ENCODING_GET(str), str);
316 static void
317 mustnot_broken(VALUE str)
319 if (is_broken_string(str)) {
320 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
324 static void
325 mustnot_wchar(VALUE str)
327 rb_encoding *enc = STR_ENC_GET(str);
328 if (rb_enc_mbminlen(enc) > 1) {
329 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
333 static int fstring_cmp(VALUE a, VALUE b);
335 static VALUE register_fstring(VALUE str, bool copy);
337 const struct st_hash_type rb_fstring_hash_type = {
338 fstring_cmp,
339 rb_str_hash,
/* True when `str` has no FL_EXIVAR flag set and its class is exactly
 * rb_cString. */
#define BARE_STRING_P(str) \
    (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
344 struct fstr_update_arg {
345 VALUE fstr;
346 bool copy;
349 static int
350 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
353 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
354 VALUE str = (VALUE)*key;
356 if (existing) {
357 /* because of lazy sweep, str may be unmarked already and swept
358 * at next time */
360 if (rb_objspace_garbage_object_p(str)) {
361 arg->fstr = Qundef;
362 return ST_DELETE;
365 arg->fstr = str;
366 return ST_STOP;
368 else {
369 if (FL_TEST_RAW(str, STR_FAKESTR)) {
370 if (arg->copy) {
371 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
372 rb_enc_copy(new_str, str);
373 str = new_str;
375 else {
376 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
377 RSTRING(str)->as.heap.len,
378 ENCODING_GET(str));
380 OBJ_FREEZE_RAW(str);
382 else {
383 if (!OBJ_FROZEN(str))
384 str = str_new_frozen(rb_cString, str);
385 if (STR_SHARED_P(str)) { /* str should not be shared */
386 /* shared substring */
387 str_make_independent(str);
388 assert(OBJ_FROZEN(str));
390 if (!BARE_STRING_P(str)) {
391 str = str_new_frozen(rb_cString, str);
394 RBASIC(str)->flags |= RSTRING_FSTR;
396 *key = *value = arg->fstr = str;
397 return ST_CONTINUE;
401 RUBY_FUNC_EXPORTED
402 VALUE
403 rb_fstring(VALUE str)
405 VALUE fstr;
406 int bare;
408 Check_Type(str, T_STRING);
410 if (FL_TEST(str, RSTRING_FSTR))
411 return str;
413 bare = BARE_STRING_P(str);
414 if (!bare) {
415 if (STR_EMBED_P(str)) {
416 OBJ_FREEZE_RAW(str);
417 return str;
419 if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
420 assert(OBJ_FROZEN(str));
421 return str;
425 if (!OBJ_FROZEN(str))
426 rb_str_resize(str, RSTRING_LEN(str));
428 fstr = register_fstring(str, FALSE);
430 if (!bare) {
431 str_replace_shared_without_enc(str, fstr);
432 OBJ_FREEZE_RAW(str);
433 return str;
435 return fstr;
438 static VALUE
439 register_fstring(VALUE str, bool copy)
441 struct fstr_update_arg args;
442 args.copy = copy;
444 RB_VM_LOCK_ENTER();
446 st_table *frozen_strings = rb_vm_fstring_table();
447 do {
448 args.fstr = str;
449 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
450 } while (args.fstr == Qundef);
452 RB_VM_LOCK_LEAVE();
454 assert(OBJ_FROZEN(args.fstr));
455 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
456 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
457 assert(RBASIC_CLASS(args.fstr) == rb_cString);
458 return args.fstr;
461 static VALUE
462 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
464 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
465 /* SHARED to be allocated by the callback */
467 if (!name) {
468 RUBY_ASSERT_ALWAYS(len == 0);
469 name = "";
472 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
474 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
475 fake_str->as.heap.len = len;
476 fake_str->as.heap.ptr = (char *)name;
477 fake_str->as.heap.aux.capa = len;
478 return (VALUE)fake_str;
482 * set up a fake string which refers a static string literal.
484 VALUE
485 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
487 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
491 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
492 * shared string which refers a static string literal. `ptr` must
493 * point a constant string.
495 MJIT_FUNC_EXPORTED VALUE
496 rb_fstring_new(const char *ptr, long len)
498 struct RString fake_str;
499 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
502 VALUE
503 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
505 struct RString fake_str;
506 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
509 VALUE
510 rb_fstring_cstr(const char *ptr)
512 return rb_fstring_new(ptr, strlen(ptr));
515 static int
516 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
518 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
519 return ST_CONTINUE;
522 static int
523 fstring_cmp(VALUE a, VALUE b)
525 long alen, blen;
526 const char *aptr, *bptr;
527 RSTRING_GETMEM(a, aptr, alen);
528 RSTRING_GETMEM(b, bptr, blen);
529 return (alen != blen ||
530 ENCODING_GET(a) != ENCODING_GET(b) ||
531 memcmp(aptr, bptr, alen) != 0);
534 static inline int
535 single_byte_optimizable(VALUE str)
537 rb_encoding *enc;
539 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
540 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
541 return 1;
543 enc = STR_ENC_GET(str);
544 if (rb_enc_mbmaxlen(enc) == 1)
545 return 1;
547 /* Conservative. Possibly single byte.
548 * "\xa1" in Shift_JIS for example. */
549 return 0;
552 VALUE rb_fs;
554 static inline const char *
555 search_nonascii(const char *p, const char *e)
557 const uintptr_t *s, *t;
559 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
560 # if SIZEOF_UINTPTR_T == 8
561 # define NONASCII_MASK UINT64_C(0x8080808080808080)
562 # elif SIZEOF_UINTPTR_T == 4
563 # define NONASCII_MASK UINT32_C(0x80808080)
564 # else
565 # error "don't know what to do."
566 # endif
567 #else
568 # if SIZEOF_UINTPTR_T == 8
569 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
570 # elif SIZEOF_UINTPTR_T == 4
571 # define NONASCII_MASK 0x80808080UL /* or...? */
572 # else
573 # error "don't know what to do."
574 # endif
575 #endif
577 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
578 #if !UNALIGNED_WORD_ACCESS
579 if ((uintptr_t)p % SIZEOF_VOIDP) {
580 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
581 p += l;
582 switch (l) {
583 default: UNREACHABLE;
584 #if SIZEOF_VOIDP > 4
585 case 7: if (p[-7]&0x80) return p-7;
586 case 6: if (p[-6]&0x80) return p-6;
587 case 5: if (p[-5]&0x80) return p-5;
588 case 4: if (p[-4]&0x80) return p-4;
589 #endif
590 case 3: if (p[-3]&0x80) return p-3;
591 case 2: if (p[-2]&0x80) return p-2;
592 case 1: if (p[-1]&0x80) return p-1;
593 case 0: break;
596 #endif
597 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
598 #define aligned_ptr(value) \
599 __builtin_assume_aligned((value), sizeof(uintptr_t))
600 #else
601 #define aligned_ptr(value) (uintptr_t *)(value)
602 #endif
603 s = aligned_ptr(p);
604 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
605 #undef aligned_ptr
606 for (;s < t; s++) {
607 if (*s & NONASCII_MASK) {
608 #ifdef WORDS_BIGENDIAN
609 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
610 #else
611 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
612 #endif
615 p = (const char *)s;
618 switch (e - p) {
619 default: UNREACHABLE;
620 #if SIZEOF_VOIDP > 4
621 case 7: if (e[-7]&0x80) return e-7;
622 case 6: if (e[-6]&0x80) return e-6;
623 case 5: if (e[-5]&0x80) return e-5;
624 case 4: if (e[-4]&0x80) return e-4;
625 #endif
626 case 3: if (e[-3]&0x80) return e-3;
627 case 2: if (e[-2]&0x80) return e-2;
628 case 1: if (e[-1]&0x80) return e-1;
629 case 0: return NULL;
633 static int
634 coderange_scan(const char *p, long len, rb_encoding *enc)
636 const char *e = p + len;
638 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
639 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
640 p = search_nonascii(p, e);
641 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
644 if (rb_enc_asciicompat(enc)) {
645 p = search_nonascii(p, e);
646 if (!p) return ENC_CODERANGE_7BIT;
647 for (;;) {
648 int ret = rb_enc_precise_mbclen(p, e, enc);
649 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
650 p += MBCLEN_CHARFOUND_LEN(ret);
651 if (p == e) break;
652 p = search_nonascii(p, e);
653 if (!p) break;
656 else {
657 while (p < e) {
658 int ret = rb_enc_precise_mbclen(p, e, enc);
659 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
660 p += MBCLEN_CHARFOUND_LEN(ret);
663 return ENC_CODERANGE_VALID;
666 long
667 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
669 const char *p = s;
671 if (*cr == ENC_CODERANGE_BROKEN)
672 return e - s;
674 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
675 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
676 if (*cr == ENC_CODERANGE_VALID) return e - s;
677 p = search_nonascii(p, e);
678 *cr = p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
679 return e - s;
681 else if (rb_enc_asciicompat(enc)) {
682 p = search_nonascii(p, e);
683 if (!p) {
684 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
685 return e - s;
687 for (;;) {
688 int ret = rb_enc_precise_mbclen(p, e, enc);
689 if (!MBCLEN_CHARFOUND_P(ret)) {
690 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
691 return p - s;
693 p += MBCLEN_CHARFOUND_LEN(ret);
694 if (p == e) break;
695 p = search_nonascii(p, e);
696 if (!p) break;
699 else {
700 while (p < e) {
701 int ret = rb_enc_precise_mbclen(p, e, enc);
702 if (!MBCLEN_CHARFOUND_P(ret)) {
703 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
704 return p - s;
706 p += MBCLEN_CHARFOUND_LEN(ret);
709 *cr = ENC_CODERANGE_VALID;
710 return e - s;
713 static inline void
714 str_enc_copy(VALUE str1, VALUE str2)
716 rb_enc_set_index(str1, ENCODING_GET(str2));
719 static void
720 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
722 /* this function is designed for copying encoding and coderange
723 * from src to new string "dest" which is made from the part of src.
725 str_enc_copy(dest, src);
726 if (RSTRING_LEN(dest) == 0) {
727 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
728 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
729 else
730 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
731 return;
733 switch (ENC_CODERANGE(src)) {
734 case ENC_CODERANGE_7BIT:
735 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
736 break;
737 case ENC_CODERANGE_VALID:
738 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
739 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
740 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
741 else
742 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
743 break;
744 default:
745 break;
749 static void
750 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
752 str_enc_copy(dest, src);
753 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
756 static int
757 enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx)
759 if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
760 rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
761 return ENC_CODERANGE_BROKEN;
763 else {
764 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
769 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
771 return enc_coderange_scan(str, enc, rb_enc_to_index(enc));
775 rb_enc_str_coderange(VALUE str)
777 int cr = ENC_CODERANGE(str);
779 if (cr == ENC_CODERANGE_UNKNOWN) {
780 int encidx = ENCODING_GET(str);
781 rb_encoding *enc = rb_enc_from_index(encidx);
782 cr = enc_coderange_scan(str, enc, encidx);
783 ENC_CODERANGE_SET(str, cr);
785 return cr;
789 rb_enc_str_asciionly_p(VALUE str)
791 rb_encoding *enc = STR_ENC_GET(str);
793 if (!rb_enc_asciicompat(enc))
794 return FALSE;
795 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
796 return TRUE;
797 return FALSE;
800 static inline void
801 str_mod_check(VALUE s, const char *p, long len)
803 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
804 rb_raise(rb_eRuntimeError, "string modified");
808 static size_t
809 str_capacity(VALUE str, const int termlen)
811 if (STR_EMBED_P(str)) {
812 #if USE_RVARGC
813 return str_embed_capa(str) - termlen;
814 #else
815 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
816 #endif
818 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
819 return RSTRING(str)->as.heap.len;
821 else {
822 return RSTRING(str)->as.heap.aux.capa;
826 size_t
827 rb_str_capacity(VALUE str)
829 return str_capacity(str, TERM_LEN(str));
832 static inline void
833 must_not_null(const char *ptr)
835 if (!ptr) {
836 rb_raise(rb_eArgError, "NULL pointer given");
840 static inline VALUE
841 str_alloc(VALUE klass, size_t size)
843 assert(size > 0);
844 RVARGC_NEWOBJ_OF(str, struct RString, klass,
845 T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
846 return (VALUE)str;
849 static inline VALUE
850 str_alloc_embed(VALUE klass, size_t capa)
852 size_t size = str_embed_size(capa);
853 assert(rb_gc_size_allocatable_p(size));
854 #if !USE_RVARGC
855 assert(size <= sizeof(struct RString));
856 #endif
857 return str_alloc(klass, size);
860 static inline VALUE
861 str_alloc_heap(VALUE klass)
863 return str_alloc(klass, sizeof(struct RString));
866 static inline VALUE
867 empty_str_alloc(VALUE klass)
869 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
870 VALUE str = str_alloc_embed(klass, 0);
871 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
872 return str;
875 static VALUE
876 str_new0(VALUE klass, const char *ptr, long len, int termlen)
878 VALUE str;
880 if (len < 0) {
881 rb_raise(rb_eArgError, "negative string size (or size too big)");
884 RUBY_DTRACE_CREATE_HOOK(STRING, len);
886 if (STR_EMBEDDABLE_P(len, termlen)) {
887 str = str_alloc_embed(klass, len + termlen);
888 if (len == 0) {
889 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
892 else {
893 str = str_alloc_heap(klass);
894 RSTRING(str)->as.heap.aux.capa = len;
895 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
896 * integer overflow. If we can STATIC_ASSERT that, the following
897 * mul_add_mul can be reverted to a simple ALLOC_N. */
898 RSTRING(str)->as.heap.ptr =
899 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
900 STR_SET_NOEMBED(str);
902 if (ptr) {
903 memcpy(RSTRING_PTR(str), ptr, len);
905 STR_SET_LEN(str, len);
906 TERM_FILL(RSTRING_PTR(str) + len, termlen);
907 return str;
910 static VALUE
911 str_new(VALUE klass, const char *ptr, long len)
913 return str_new0(klass, ptr, len, 1);
916 VALUE
917 rb_str_new(const char *ptr, long len)
919 return str_new(rb_cString, ptr, len);
922 VALUE
923 rb_usascii_str_new(const char *ptr, long len)
925 VALUE str = rb_str_new(ptr, len);
926 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
927 return str;
930 VALUE
931 rb_utf8_str_new(const char *ptr, long len)
933 VALUE str = str_new(rb_cString, ptr, len);
934 rb_enc_associate_index(str, rb_utf8_encindex());
935 return str;
938 VALUE
939 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
941 VALUE str;
943 if (!enc) return rb_str_new(ptr, len);
945 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
946 rb_enc_associate(str, enc);
947 return str;
950 VALUE
951 rb_str_new_cstr(const char *ptr)
953 must_not_null(ptr);
954 /* rb_str_new_cstr() can take pointer from non-malloc-generated
955 * memory regions, and that cannot be detected by the MSAN. Just
956 * trust the programmer that the argument passed here is a sane C
957 * string. */
958 __msan_unpoison_string(ptr);
959 return rb_str_new(ptr, strlen(ptr));
962 VALUE
963 rb_usascii_str_new_cstr(const char *ptr)
965 VALUE str = rb_str_new_cstr(ptr);
966 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
967 return str;
970 VALUE
971 rb_utf8_str_new_cstr(const char *ptr)
973 VALUE str = rb_str_new_cstr(ptr);
974 rb_enc_associate_index(str, rb_utf8_encindex());
975 return str;
978 VALUE
979 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
981 must_not_null(ptr);
982 if (rb_enc_mbminlen(enc) != 1) {
983 rb_raise(rb_eArgError, "wchar encoding given");
985 return rb_enc_str_new(ptr, strlen(ptr), enc);
988 static VALUE
989 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
991 VALUE str;
993 if (len < 0) {
994 rb_raise(rb_eArgError, "negative string size (or size too big)");
997 if (!ptr) {
998 rb_encoding *enc = rb_enc_get_from_index(encindex);
999 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1001 else {
1002 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1003 str = str_alloc_heap(klass);
1004 RSTRING(str)->as.heap.len = len;
1005 RSTRING(str)->as.heap.ptr = (char *)ptr;
1006 RSTRING(str)->as.heap.aux.capa = len;
1007 STR_SET_NOEMBED(str);
1008 RBASIC(str)->flags |= STR_NOFREE;
1010 rb_enc_associate_index(str, encindex);
1011 return str;
1014 VALUE
1015 rb_str_new_static(const char *ptr, long len)
1017 return str_new_static(rb_cString, ptr, len, 0);
1020 VALUE
1021 rb_usascii_str_new_static(const char *ptr, long len)
1023 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1026 VALUE
1027 rb_utf8_str_new_static(const char *ptr, long len)
1029 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1032 VALUE
1033 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1035 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1038 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1039 rb_encoding *from, rb_encoding *to,
1040 int ecflags, VALUE ecopts);
1042 static inline bool
1043 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1045 int encidx = rb_enc_to_index(enc);
1046 if (rb_enc_get_index(str) == encidx)
1047 return is_ascii_string(str);
1048 return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT;
1051 VALUE
1052 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1054 long len;
1055 const char *ptr;
1056 VALUE newstr;
1058 if (!to) return str;
1059 if (!from) from = rb_enc_get(str);
1060 if (from == to) return str;
1061 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1062 to == rb_ascii8bit_encoding()) {
1063 if (STR_ENC_GET(str) != to) {
1064 str = rb_str_dup(str);
1065 rb_enc_associate(str, to);
1067 return str;
1070 RSTRING_GETMEM(str, ptr, len);
1071 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1072 from, to, ecflags, ecopts);
1073 if (NIL_P(newstr)) {
1074 /* some error, return original */
1075 return str;
1077 return newstr;
1080 VALUE
1081 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1082 rb_encoding *from, int ecflags, VALUE ecopts)
1084 long olen;
1086 olen = RSTRING_LEN(newstr);
1087 if (ofs < -olen || olen < ofs)
1088 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1089 if (ofs < 0) ofs += olen;
1090 if (!from) {
1091 STR_SET_LEN(newstr, ofs);
1092 return rb_str_cat(newstr, ptr, len);
1095 rb_str_modify(newstr);
1096 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1097 rb_enc_get(newstr),
1098 ecflags, ecopts);
1101 VALUE
1102 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1104 STR_SET_LEN(str, 0);
1105 rb_enc_associate(str, enc);
1106 rb_str_cat(str, ptr, len);
1107 return str;
1110 static VALUE
1111 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1112 rb_encoding *from, rb_encoding *to,
1113 int ecflags, VALUE ecopts)
1115 rb_econv_t *ec;
1116 rb_econv_result_t ret;
1117 long olen;
1118 VALUE econv_wrapper;
1119 const unsigned char *start, *sp;
1120 unsigned char *dest, *dp;
1121 size_t converted_output = (size_t)ofs;
1123 olen = rb_str_capacity(newstr);
1125 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1126 RBASIC_CLEAR_CLASS(econv_wrapper);
1127 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1128 if (!ec) return Qnil;
1129 DATA_PTR(econv_wrapper) = ec;
1131 sp = (unsigned char*)ptr;
1132 start = sp;
1133 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1134 (dp = dest + converted_output),
1135 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1136 ret == econv_destination_buffer_full) {
1137 /* destination buffer short */
1138 size_t converted_input = sp - start;
1139 size_t rest = len - converted_input;
1140 converted_output = dp - dest;
1141 rb_str_set_len(newstr, converted_output);
1142 if (converted_input && converted_output &&
1143 rest < (LONG_MAX / converted_output)) {
1144 rest = (rest * converted_output) / converted_input;
1146 else {
1147 rest = olen;
1149 olen += rest < 2 ? 2 : rest;
1150 rb_str_resize(newstr, olen);
1152 DATA_PTR(econv_wrapper) = 0;
1153 rb_econv_close(ec);
1154 switch (ret) {
1155 case econv_finished:
1156 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1157 rb_str_set_len(newstr, len);
1158 rb_enc_associate(newstr, to);
1159 return newstr;
1161 default:
1162 return Qnil;
1166 VALUE
1167 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1169 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1172 VALUE
1173 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1175 rb_encoding *ienc;
1176 VALUE str;
1177 const int eidx = rb_enc_to_index(eenc);
1179 if (!ptr) {
1180 return rb_enc_str_new(ptr, len, eenc);
1183 /* ASCII-8BIT case, no conversion */
1184 if ((eidx == rb_ascii8bit_encindex()) ||
1185 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1186 return rb_str_new(ptr, len);
1188 /* no default_internal or same encoding, no conversion */
1189 ienc = rb_default_internal_encoding();
1190 if (!ienc || eenc == ienc) {
1191 return rb_enc_str_new(ptr, len, eenc);
1193 /* ASCII compatible, and ASCII only string, no conversion in
1194 * default_internal */
1195 if ((eidx == rb_ascii8bit_encindex()) ||
1196 (eidx == rb_usascii_encindex()) ||
1197 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1198 return rb_enc_str_new(ptr, len, ienc);
1200 /* convert from the given encoding to default_internal */
1201 str = rb_enc_str_new(NULL, 0, ienc);
1202 /* when the conversion failed for some reason, just ignore the
1203 * default_internal and result in the given encoding as-is. */
1204 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1205 rb_str_initialize(str, ptr, len, eenc);
1207 return str;
1210 VALUE
1211 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1213 int eidx = rb_enc_to_index(eenc);
1214 if (eidx == rb_usascii_encindex() &&
1215 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1216 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1217 return str;
1219 rb_enc_associate_index(str, eidx);
1220 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1223 VALUE
1224 rb_external_str_new(const char *ptr, long len)
1226 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1229 VALUE
1230 rb_external_str_new_cstr(const char *ptr)
1232 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1235 VALUE
1236 rb_locale_str_new(const char *ptr, long len)
1238 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1241 VALUE
1242 rb_locale_str_new_cstr(const char *ptr)
1244 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1247 VALUE
1248 rb_filesystem_str_new(const char *ptr, long len)
1250 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1253 VALUE
1254 rb_filesystem_str_new_cstr(const char *ptr)
1256 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1259 VALUE
1260 rb_str_export(VALUE str)
1262 return rb_str_export_to_enc(str, rb_default_external_encoding());
1265 VALUE
1266 rb_str_export_locale(VALUE str)
1268 return rb_str_export_to_enc(str, rb_locale_encoding());
1271 VALUE
1272 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1274 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
/*
 * Makes str2 hold the same bytes as str and returns str2.  Encoding and
 * coderange are not touched here (str_replace_shared() copies those
 * afterwards).
 *
 * When the content plus terminator fits str2's embedded capacity the
 * bytes are copied in.  Otherwise str2 is pointed at the buffer of a
 * frozen root string and marked as sharing with that root.
 */
1277 static VALUE
1278 str_replace_shared_without_enc(VALUE str2, VALUE str)
1280 const int termlen = TERM_LEN(str);
1281 char *ptr;
1282 long len;
1284 RSTRING_GETMEM(str, ptr, len);
/* Small enough: copy into str2's embedded storage and terminate it. */
1285 if (str_embed_capa(str2) >= len + termlen) {
1286 char *ptr2 = RSTRING(str2)->as.embed.ary;
1287 STR_SET_EMBED(str2);
1288 memcpy(ptr2, RSTRING_PTR(str), len);
1289 STR_SET_EMBED_LEN(str2, len);
1290 TERM_FILL(ptr2+len, termlen);
1292 else {
1293 VALUE root;
/* Pick the frozen root: the existing shared root if str already shares
 * one, otherwise whatever rb_str_new_frozen(str) returns.  ptr/len are
 * re-read afterwards because the buffer may now belong to the root. */
1294 if (STR_SHARED_P(str)) {
1295 root = RSTRING(str)->as.heap.aux.shared;
1296 RSTRING_GETMEM(str, ptr, len);
1298 else {
1299 root = rb_str_new_frozen(str);
1300 RSTRING_GETMEM(root, ptr, len);
1302 assert(OBJ_FROZEN(root));
/* Release str2's current heap buffer only when str2 owns it (not
 * embedded, not shared, not NOFREE) ... */
1303 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1304 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1305 rb_fatal("about to free a possible shared root");
1307 char *ptr2 = STR_HEAP_PTR(str2);
/* ... and only when it is not the very buffer about to be adopted. */
1308 if (ptr2 != ptr) {
1309 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
/* Point str2 at the root's bytes and record the sharing relation. */
1312 FL_SET(str2, STR_NOEMBED);
1313 RSTRING(str2)->as.heap.len = len;
1314 RSTRING(str2)->as.heap.ptr = ptr;
1315 STR_SET_SHARED(str2, root);
1317 return str2;
1320 static VALUE
1321 str_replace_shared(VALUE str2, VALUE str)
1323 str_replace_shared_without_enc(str2, str);
1324 rb_enc_cr_str_exact_copy(str2, str);
1325 return str2;
1328 static VALUE
1329 str_new_shared(VALUE klass, VALUE str)
1331 return str_replace_shared(str_alloc_heap(klass), str);
1334 VALUE
1335 rb_str_new_shared(VALUE str)
1337 return str_new_shared(rb_obj_class(str), str);
1340 VALUE
1341 rb_str_new_frozen(VALUE orig)
1343 if (OBJ_FROZEN(orig)) return orig;
1344 return str_new_frozen(rb_obj_class(orig), orig);
1347 static VALUE
1348 rb_str_new_frozen_String(VALUE orig)
1350 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1351 return str_new_frozen(rb_cString, orig);
1354 VALUE
1355 rb_str_tmp_frozen_acquire(VALUE orig)
1357 if (OBJ_FROZEN_RAW(orig)) return orig;
1358 return str_new_frozen_buffer(0, orig, FALSE);
/*
 * Counterpart of rb_str_tmp_frozen_acquire(): gives the buffer held by the
 * temporary +tmp+ back to +orig+ when that is possible.
 *
 * Nothing happens unless +tmp+ has class 0, i.e. it is a temporary of the
 * kind rb_str_tmp_frozen_acquire() creates.
 */
1361 void
1362 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1364 if (RBASIC_CLASS(tmp) != 0)
1365 return;
/* An embedded temporary owns its own copy; nothing to hand back. */
1367 if (STR_EMBED_P(tmp)) {
1368 assert(OBJ_FROZEN_RAW(tmp));
/* orig is still a shared, unlocked, unfrozen string ... */
1370 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1371 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1372 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* ... whose root is exactly tmp, and tmp was not flagged STR_BORROWED. */
1374 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1375 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1376 assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1378 /* Unshare orig since the root (tmp) only has this one child. */
1379 FL_UNSET_RAW(orig, STR_SHARED);
1380 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
/* Carry a STR_NOFREE flag on tmp back over to orig. */
1381 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1382 assert(OBJ_FROZEN_RAW(tmp));
1384 /* Make tmp embedded and empty so it is safe for sweeping. */
1385 STR_SET_EMBED(tmp);
1386 STR_SET_EMBED_LEN(tmp, 0);
1391 static VALUE
1392 str_new_frozen(VALUE klass, VALUE orig)
1394 return str_new_frozen_buffer(klass, orig, TRUE);
1397 static VALUE
1398 heap_str_make_shared(VALUE klass, VALUE orig)
1400 assert(!STR_EMBED_P(orig));
1401 assert(!STR_SHARED_P(orig));
1403 VALUE str = str_alloc_heap(klass);
1404 STR_SET_NOEMBED(str);
1405 RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1406 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1407 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1408 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1409 RBASIC(orig)->flags &= ~STR_NOFREE;
1410 STR_SET_SHARED(orig, str);
1411 if (klass == 0)
1412 FL_UNSET_RAW(str, STR_BORROWED);
1413 return str;
/*
 * Returns a frozen string of class +klass+ with the content of +orig+.
 *
 * klass         - class of the result; callers in this file pass 0 for
 *                 temporaries (see rb_str_tmp_frozen_acquire).
 * copy_encoding - when true, the terminator length follows orig and the
 *                 encoding/coderange are copied onto the result; when
 *                 false a 1-byte terminator is assumed and the encoding
 *                 is left alone.
 *
 * Depending on orig the result is an embedded copy, a string sharing
 * orig's existing root, that root itself, or a new root that orig is
 * made to share.
 */
1416 static VALUE
1417 str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1419 VALUE str;
1421 long len = RSTRING_LEN(orig);
1422 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
/* Short content: make an independent embedded copy. */
1424 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1425 str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
1426 assert(STR_EMBED_P(str));
1428 else {
/* orig already shares a root: work out which slice of the root it is.
 * ofs = bytes of the root before orig's start, rest = bytes after its end. */
1429 if (FL_TEST_RAW(orig, STR_SHARED)) {
1430 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1431 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1432 long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
1433 assert(ofs >= 0);
1434 assert(rest >= 0);
1435 assert(ofs + rest <= RSTRING_LEN(shared));
1436 #if !USE_RVARGC
1437 assert(!STR_EMBED_P(shared));
1438 #endif
1439 assert(OBJ_FROZEN(shared));
/* The root cannot stand in for orig if orig is a proper slice of it, or
 * the class or encoding differs: build a new string sharing the root and
 * narrow it to orig's slice. */
1441 if ((ofs > 0) || (rest > 0) ||
1442 (klass != RBASIC(shared)->klass) ||
1443 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1444 str = str_new_shared(klass, shared);
1445 assert(!STR_EMBED_P(str));
1446 RSTRING(str)->as.heap.ptr += ofs;
1447 RSTRING(str)->as.heap.len -= ofs + rest;
/* Otherwise the root itself is the answer; a class-0 root is flagged
 * STR_BORROWED before being handed out. */
1449 else {
1450 if (RBASIC_CLASS(shared) == 0)
1451 FL_SET_RAW(shared, STR_BORROWED);
1452 return shared;
/* Not shared and embeddable with orig's own terminator length: embedded copy. */
1455 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1456 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1457 STR_SET_EMBED(str);
1458 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1459 STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1460 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
/* Otherwise hand orig's buffer to a new root and make orig share it. */
1462 else {
1463 str = heap_str_make_shared(klass, orig);
1467 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1468 OBJ_FREEZE(str);
1469 return str;
1472 VALUE
1473 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1475 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1478 static VALUE
1479 str_new_empty_String(VALUE str)
1481 VALUE v = rb_str_new(0, 0);
1482 rb_enc_copy(v, str);
1483 return v;
/* Lower bound applied to requested buffer capacities (used by
 * rb_str_buf_new and rb_str_init in this file). */
1486 #define STR_BUF_MIN_SIZE 63
1487 #if !USE_RVARGC
/* Without RVARGC the minimum must be larger than the embedded capacity. */
1488 STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1489 #endif
1491 VALUE
1492 rb_str_buf_new(long capa)
1494 if (STR_EMBEDDABLE_P(capa, 1)) {
1495 return str_alloc_embed(rb_cString, capa + 1);
1498 VALUE str = str_alloc_heap(rb_cString);
1500 #if !USE_RVARGC
1501 if (capa < STR_BUF_MIN_SIZE) {
1502 capa = STR_BUF_MIN_SIZE;
1504 #endif
1505 FL_SET(str, STR_NOEMBED);
1506 RSTRING(str)->as.heap.aux.capa = capa;
1507 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1508 RSTRING(str)->as.heap.ptr[0] = '\0';
1510 return str;
1513 VALUE
1514 rb_str_buf_new_cstr(const char *ptr)
1516 VALUE str;
1517 long len = strlen(ptr);
1519 str = rb_str_buf_new(len);
1520 rb_str_buf_cat(str, ptr, len);
1522 return str;
1525 VALUE
1526 rb_str_tmp_new(long len)
1528 return str_new(0, 0, len);
1531 void
1532 rb_str_free(VALUE str)
1534 if (FL_TEST(str, RSTRING_FSTR)) {
1535 st_data_t fstr = (st_data_t)str;
1537 RB_VM_LOCK_ENTER();
1539 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1540 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1542 RB_VM_LOCK_LEAVE();
1545 if (STR_EMBED_P(str)) {
1546 RB_DEBUG_COUNTER_INC(obj_str_embed);
1548 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1549 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1550 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1552 else {
1553 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1554 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1558 RUBY_FUNC_EXPORTED size_t
1559 rb_str_memsize(VALUE str)
1561 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1562 return STR_HEAP_SIZE(str);
1564 else {
1565 return 0;
1569 VALUE
1570 rb_str_to_str(VALUE str)
1572 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
/* Forward declarations; both functions are defined further down in this file. */
1575 static inline void str_discard(VALUE str);
1576 static void str_shared_replace(VALUE str, VALUE str2);
1578 void
1579 rb_str_shared_replace(VALUE str, VALUE str2)
1581 if (str != str2) str_shared_replace(str, str2);
/*
 * Moves the content of str2 into str, leaving str2 as an empty embedded
 * string.  str and str2 must be different objects.
 *
 * str's previous buffer is dropped through str_discard().  Short content
 * is copied into str's embedded storage; otherwise str takes over str2's
 * heap buffer (or its shared root).  str ends up with str2's encoding
 * and coderange.
 */
1584 static void
1585 str_shared_replace(VALUE str, VALUE str2)
1587 rb_encoding *enc;
1588 int cr;
1589 int termlen;
1591 RUBY_ASSERT(str2 != str);
/* Capture str2's encoding and coderange before anything is modified. */
1592 enc = STR_ENC_GET(str2);
1593 cr = ENC_CODERANGE(str2);
1594 str_discard(str);
1595 termlen = rb_enc_mbminlen(enc);
/* Content and terminator fit str's embedded capacity: plain copy. */
1597 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1598 STR_SET_EMBED(str);
1599 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1600 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1601 rb_enc_associate(str, enc);
1602 ENC_CODERANGE_SET(str, cr);
1604 else {
1605 #if USE_RVARGC
/* str2 is embedded but too large for str's slot: first move str2's
 * bytes to a heap buffer so that it can be handed over below. */
1606 if (STR_EMBED_P(str2)) {
1607 assert(!FL_TEST(str2, STR_SHARED));
1608 long len = RSTRING(str2)->as.embed.len;
1609 assert(len + termlen <= str_embed_capa(str2));
1611 char *new_ptr = ALLOC_N(char, len + termlen);
1612 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1613 RSTRING(str2)->as.heap.ptr = new_ptr;
1614 RSTRING(str2)->as.heap.len = len;
1615 RSTRING(str2)->as.heap.aux.capa = len;
1616 STR_SET_NOEMBED(str2);
1618 #endif
/* str adopts str2's pointer and length ... */
1620 STR_SET_NOEMBED(str);
1621 FL_UNSET(str, STR_SHARED);
1622 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1623 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
/* ... plus either the shared root or the capacity, whichever str2 has. */
1625 if (FL_TEST(str2, STR_SHARED)) {
1626 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1627 STR_SET_SHARED(str, shared);
1629 else {
1630 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1633 /* abandon str2 */
1634 STR_SET_EMBED(str2);
1635 RSTRING_PTR(str2)[0] = 0;
1636 STR_SET_EMBED_LEN(str2, 0);
1637 rb_enc_associate(str, enc);
1638 ENC_CODERANGE_SET(str, cr);
1642 VALUE
1643 rb_obj_as_string(VALUE obj)
1645 VALUE str;
1647 if (RB_TYPE_P(obj, T_STRING)) {
1648 return obj;
1650 str = rb_funcall(obj, idTo_s, 0);
1651 return rb_obj_as_string_result(str, obj);
1654 MJIT_FUNC_EXPORTED VALUE
1655 rb_obj_as_string_result(VALUE str, VALUE obj)
1657 if (!RB_TYPE_P(str, T_STRING))
1658 return rb_any_to_s(obj);
1659 return str;
1662 static VALUE
1663 str_replace(VALUE str, VALUE str2)
1665 long len;
1667 len = RSTRING_LEN(str2);
1668 if (STR_SHARED_P(str2)) {
1669 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1670 assert(OBJ_FROZEN(shared));
1671 STR_SET_NOEMBED(str);
1672 RSTRING(str)->as.heap.len = len;
1673 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1674 STR_SET_SHARED(str, shared);
1675 rb_enc_cr_str_exact_copy(str, str2);
1677 else {
1678 str_replace_shared(str, str2);
1681 return str;
1684 static inline VALUE
1685 ec_str_alloc(struct rb_execution_context_struct *ec, VALUE klass, size_t size)
1687 assert(size > 0);
1688 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1689 T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
1690 return (VALUE)str;
1693 static inline VALUE
1694 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1696 size_t size = str_embed_size(capa);
1697 assert(rb_gc_size_allocatable_p(size));
1698 #if !USE_RVARGC
1699 assert(size <= sizeof(struct RString));
1700 #endif
1701 return ec_str_alloc(ec, klass, size);
1704 static inline VALUE
1705 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1707 return ec_str_alloc(ec, klass, sizeof(struct RString));
/*
 * Fills the freshly allocated +dup+ so that it duplicates +str+, and
 * returns +dup+.
 *
 * An embedded +str+ is copied byte-wise.  Otherwise +dup+ is made to
 * share the buffer of a frozen root (str's existing root, str itself if
 * it is frozen, or a frozen string created from str).  The coderange and
 * encoding flags are carried over; the frozen flag is not.
 */
1710 static inline VALUE
1711 str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
/* Flags that are transferred from str to dup. */
1713 const VALUE flag_mask =
1714 #if !USE_RVARGC
1715 RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1716 #endif
1717 ENC_CODERANGE_MASK | ENCODING_MASK |
1718 FL_FREEZE
1720 VALUE flags = FL_TEST_RAW(str, flag_mask);
1721 int encidx = 0;
/* Embedded source: copy the bytes plus one terminator byte. */
1722 if (STR_EMBED_P(str)) {
1723 long len = RSTRING_EMBED_LEN(str);
1725 assert(str_embed_capa(dup) >= len + 1);
1726 STR_SET_EMBED_LEN(dup, len);
1727 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1729 else {
/* Heap source: find (or create) the frozen root whose buffer dup will share. */
1730 VALUE root = str;
1731 if (FL_TEST_RAW(str, STR_SHARED)) {
1732 root = RSTRING(str)->as.heap.aux.shared;
/* Unfrozen and unshared: create a frozen string and duplicate that instead;
 * flags are re-read because str now names the new string. */
1734 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1735 root = str = str_new_frozen(klass, str);
1736 flags = FL_TEST_RAW(str, flag_mask);
1738 assert(!STR_SHARED_P(root));
1739 assert(RB_OBJ_FROZEN_RAW(root));
/* With RVARGC the sharing branch below is always taken; without it an
 * embedded root is copied byte-wise instead. */
1740 #if USE_RVARGC
1741 if (1) {
1742 #else
1743 if (STR_EMBED_P(root)) {
1744 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
1745 char, RSTRING_EMBED_LEN_MAX + 1);
1747 else {
1748 #endif
1749 RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
1750 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1751 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1752 flags |= RSTRING_NOEMBED | STR_SHARED;
/* An inline encoding field equal to ENCODING_INLINE_MAX is not copied as
 * flag bits; the index is fetched with rb_enc_get_index() and associated
 * with dup separately below. */
1756 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1757 encidx = rb_enc_get_index(str);
1758 flags &= ~ENCODING_MASK;
/* The duplicate is never frozen. */
1760 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1761 if (encidx) rb_enc_associate_index(dup, encidx);
1762 return dup;
1765 static inline VALUE
1766 ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1768 VALUE dup;
1769 if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1770 dup = ec_str_alloc_heap(ec, klass);
1772 else {
1773 dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1776 return str_duplicate_setup(klass, str, dup);
1779 static inline VALUE
1780 str_duplicate(VALUE klass, VALUE str)
1782 VALUE dup;
1783 if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1784 dup = str_alloc_heap(klass);
1786 else {
1787 dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1790 return str_duplicate_setup(klass, str, dup);
1793 VALUE
1794 rb_str_dup(VALUE str)
1796 return str_duplicate(rb_obj_class(str), str);
1799 VALUE
1800 rb_str_resurrect(VALUE str)
1802 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1803 return str_duplicate(rb_cString, str);
1806 VALUE
1807 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1809 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1810 return ec_str_duplicate(ec, rb_cString, str);
1814 * call-seq:
1815 * String.new(string = '') -> new_string
1816 * String.new(string = '', encoding: encoding) -> new_string
1817 * String.new(string = '', capacity: size) -> new_string
1819 * Returns a new \String that is a copy of +string+.
1821 * With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
1822 * s = String.new
1823 * s # => ""
1824 * s.encoding # => #<Encoding:ASCII-8BIT>
1826 * With the single \String argument +string+, returns a copy of +string+
1827 * with the same encoding as +string+:
1828 * s = String.new("Que veut dire \u{e7}a?")
1829 * s # => "Que veut dire \u{e7}a?"
1830 * s.encoding # => #<Encoding:UTF-8>
1832 * Literal strings like <tt>""</tt> or here-documents always use
1833 * {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
1835 * With keyword +encoding+, returns a copy of +string+
1836 * with the specified encoding:
1837 * s = String.new(encoding: 'ASCII')
1838 * s.encoding # => #<Encoding:US-ASCII>
1839 * s = String.new('foo', encoding: 'ASCII')
1840 * s.encoding # => #<Encoding:US-ASCII>
1842 * Note that these are equivalent:
1843 * s0 = String.new('foo', encoding: 'ASCII')
1844 * s1 = 'foo'.force_encoding('ASCII')
1845 * s0.encoding == s1.encoding # => true
1847 * With keyword +capacity+, returns a copy of +string+;
1848 * the given +capacity+ may set the size of the internal buffer,
1849 * which may affect performance:
1850 * String.new(capacity: 1) # => ""
1851 * String.new(capacity: 4096) # => ""
1853 * The +string+, +encoding+, and +capacity+ arguments may all be used together:
1855 * String.new('hello', encoding: 'UTF-8', capacity: 25)
/*
 * Implementation of String#initialize: accepts an optional source string
 * plus the keywords +encoding+ and +capacity+, (re)initializes +str+ in
 * place and returns it.
 */
1859 static VALUE
1860 rb_str_init(int argc, VALUE *argv, VALUE str)
1862 static ID keyword_ids[2];
1863 VALUE orig, opt, venc, vcapa;
1864 VALUE kwargs[2];
1865 rb_encoding *enc = 0;
1866 int n;
/* Intern the keyword names once, on first use. */
1868 if (!keyword_ids[0]) {
1869 keyword_ids[0] = rb_id_encoding();
1870 CONST_ID(keyword_ids[1], "capacity");
/* n is the number of positional arguments (0 or 1); opt is the keyword hash or nil. */
1873 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1874 if (!NIL_P(opt)) {
1875 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1876 venc = kwargs[0];
1877 vcapa = kwargs[1];
1878 if (venc != Qundef && !NIL_P(venc)) {
1879 enc = rb_to_encoding(venc);
/* capacity: given - str is forced onto a heap buffer of the requested size. */
1881 if (vcapa != Qundef && !NIL_P(vcapa)) {
1882 long capa = NUM2LONG(vcapa);
1883 long len = 0;
1884 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1886 if (capa < STR_BUF_MIN_SIZE) {
1887 capa = STR_BUF_MIN_SIZE;
/* The capacity never drops below the length of the source string. */
1889 if (n == 1) {
1890 StringValue(orig);
1891 len = RSTRING_LEN(orig);
1892 if (capa < len) {
1893 capa = len;
/* Initializing from itself: skip the copy further down. */
1895 if (orig == str) n = 0;
1897 str_modifiable(str);
1898 if (STR_EMBED_P(str)) { /* make noembed always */
1899 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1900 #if USE_RVARGC
1901 assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
1902 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
1903 #else
1904 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
1905 #endif
1906 RSTRING(str)->as.heap.ptr = new_ptr;
/* Shared or NOFREE buffer: str does not own it, so copy into a private
 * buffer (at most the new size) and clear both flags. */
1908 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1909 const size_t size = (size_t)capa + termlen;
1910 const char *const old_ptr = RSTRING_PTR(str);
1911 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1912 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1913 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1914 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1915 RSTRING(str)->as.heap.ptr = new_ptr;
/* Owned heap buffer of a different size: resize it in place. */
1917 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1918 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1919 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1921 RSTRING(str)->as.heap.len = len;
1922 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
/* Copy the source bytes together with its encoding and coderange. */
1923 if (n == 1) {
1924 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1925 rb_enc_cr_str_exact_copy(str, orig);
1927 FL_SET(str, STR_NOEMBED);
1928 RSTRING(str)->as.heap.aux.capa = capa;
/* Keywords without capacity: ordinary replace. */
1930 else if (n == 1) {
1931 rb_str_replace(str, orig);
/* An explicit encoding overrides whatever was copied; the coderange is cleared. */
1933 if (enc) {
1934 rb_enc_associate(str, enc);
1935 ENC_CODERANGE_CLEAR(str);
/* No keywords at all: ordinary replace when a source string was given. */
1938 else if (n == 1) {
1939 rb_str_replace(str, orig);
1941 return str;
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts the UTF-8 leading bytes contained in the machine word at +s+.
 *
 * A UTF-8 leading byte looks like 0xxxxxxx or 11xxxxxx, i.e. everything
 * except 10xxxxxx.  For a single byte that test is
 * "bit 6 is set OR bit 7 is clear".  The same expression is evaluated for
 * every byte of the word at once, leaving the answer in bit 0 of each
 * byte, and the per-byte answers are then summed.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t lead = *s;

    /* bit0 of every byte := (bit6 of that byte) | !(bit7 of that byte) */
    lead = (lead >> 6) | (~lead >> 7);
    lead &= NONASCII_MASK >> 7;

    /* Sum the per-byte flags. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lead);
#else
    lead += lead >> 8;
    lead += lead >> 16;
# if SIZEOF_VOIDP == 8
    lead += lead >> 32;
# endif
    return lead & 0xF;
#endif
}
#endif
/*
 * Returns the number of characters between +p+ and +e+ in encoding +enc+.
 * +cr+ is the coderange the caller already knows for this range; it only
 * selects which counting strategy is used.
 */
1984 static inline long
1985 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1987 long c;
1988 const char *q;
/* Fixed-width encoding: byte length divided by the width, counting a
 * trailing partial character as one more. */
1990 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1991 long diff = (long)(e - p);
1992 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1994 #ifdef NONASCII_MASK
/* UTF-8 with a VALID coderange: the character count equals the number of leading bytes. */
1995 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1996 uintptr_t len = 0;
/* Only worth the word-wise path for ranges longer than two words. */
1997 if ((int)sizeof(uintptr_t) * 2 < e - p) {
1998 const uintptr_t *s, *t;
1999 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
/* s = p rounded up, t = e rounded down, both to a word boundary. */
2000 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2001 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
/* Unaligned head, byte by byte. */
2002 while (p < (const char *)s) {
2003 if (is_utf8_lead_byte(*p)) len++;
2004 p++;
/* Aligned middle, one word at a time. */
2006 while (s < t) {
2007 len += count_utf8_lead_bytes_with_word(s);
2008 s++;
2010 p = (const char *)s;
/* Remaining tail (or the whole range when it was short), byte by byte. */
2012 while (p < e) {
2013 if (is_utf8_lead_byte(*p)) len++;
2014 p++;
2016 return (long)len;
2018 #endif
/* ASCII-compatible encoding: skip ASCII runs with search_nonascii(), then
 * step over one multibyte character at a time. */
2019 else if (rb_enc_asciicompat(enc)) {
2020 c = 0;
/* Clean coderange: rb_enc_fast_mbclen() is used for the character length. */
2021 if (ENC_CODERANGE_CLEAN_P(cr)) {
2022 while (p < e) {
2023 if (ISASCII(*p)) {
2024 q = search_nonascii(p, e);
2025 if (!q)
2026 return c + (e - p);
2027 c += q - p;
2028 p = q;
2030 p += rb_enc_fast_mbclen(p, e, enc);
2031 c++;
/* Otherwise the same walk using rb_enc_mbclen(). */
2034 else {
2035 while (p < e) {
2036 if (ISASCII(*p)) {
2037 q = search_nonascii(p, e);
2038 if (!q)
2039 return c + (e - p);
2040 c += q - p;
2041 p = q;
2043 p += rb_enc_mbclen(p, e, enc);
2044 c++;
2047 return c;
/* Any other encoding: step character by character with rb_enc_mbclen(). */
2050 for (c=0; p<e; c++) {
2051 p += rb_enc_mbclen(p, e, enc);
2053 return c;
2056 long
2057 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2059 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2062 /* To get strlen with cr
2063 * Note that given cr is not used.
/*
 * Counts the characters between +p+ and +e+ in encoding +enc+ and reports
 * the coderange found along the way through +cr+.
 *
 * *cr is an output only: it is reset to 0 on entry.  Note that the
 * fixed-width branch returns without storing a coderange, so *cr stays 0
 * there.
 */
2065 long
2066 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2068 long c;
2069 const char *q;
2070 int ret;
2072 *cr = 0;
/* Fixed-width encoding: computed from the byte length alone. */
2073 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2074 long diff = (long)(e - p);
2075 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
/* ASCII-compatible: skip ASCII runs, validate each multibyte character. */
2077 else if (rb_enc_asciicompat(enc)) {
2078 c = 0;
2079 while (p < e) {
2080 if (ISASCII(*p)) {
2081 q = search_nonascii(p, e);
/* Only ASCII remains: 7BIT unless something else was already recorded. */
2082 if (!q) {
2083 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2084 return c + (e - p);
2086 c += q - p;
2087 p = q;
2089 ret = rb_enc_precise_mbclen(p, e, enc);
2090 if (MBCLEN_CHARFOUND_P(ret)) {
2091 *cr |= ENC_CODERANGE_VALID;
2092 p += MBCLEN_CHARFOUND_LEN(ret);
/* Invalid sequence: mark BROKEN and count the single byte as one character. */
2094 else {
2095 *cr = ENC_CODERANGE_BROKEN;
2096 p++;
2098 c++;
2100 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2101 return c;
/* Any other encoding: validate character by character. */
2104 for (c=0; p<e; c++) {
2105 ret = rb_enc_precise_mbclen(p, e, enc);
2106 if (MBCLEN_CHARFOUND_P(ret)) {
2107 *cr |= ENC_CODERANGE_VALID;
2108 p += MBCLEN_CHARFOUND_LEN(ret);
/* Invalid sequence: mark BROKEN and skip the minimum character width,
 * never stepping past e. */
2110 else {
2111 *cr = ENC_CODERANGE_BROKEN;
2112 if (p + rb_enc_mbminlen(enc) <= e)
2113 p += rb_enc_mbminlen(enc);
2114 else
2115 p = e;
2118 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2119 return c;
2122 /* enc must be str's enc or rb_enc_check(str, str2) */
2123 static long
2124 str_strlen(VALUE str, rb_encoding *enc)
2126 const char *p, *e;
2127 int cr;
2129 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2130 if (!enc) enc = STR_ENC_GET(str);
2131 p = RSTRING_PTR(str);
2132 e = RSTRING_END(str);
2133 cr = ENC_CODERANGE(str);
2135 if (cr == ENC_CODERANGE_UNKNOWN) {
2136 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2137 if (cr) ENC_CODERANGE_SET(str, cr);
2138 return n;
2140 else {
2141 return enc_strlen(p, e, enc, cr);
2145 long
2146 rb_str_strlen(VALUE str)
2148 return str_strlen(str, NULL);
2152 * call-seq:
2153 * length -> integer
2155 * Returns the count of characters (not bytes) in +self+:
2157 * "\x80\u3042".length # => 2
2158 * "hello".length # => 5
2160 * String#size is an alias for String#length.
2162 * Related: String#bytesize.
2165 VALUE
2166 rb_str_length(VALUE str)
2168 return LONG2NUM(str_strlen(str, NULL));
2172 * call-seq:
2173 * bytesize -> integer
2175 * Returns the count of bytes in +self+:
2177 * "\x80\u3042".bytesize # => 4
2178 * "hello".bytesize # => 5
2180 * Related: String#length.
2183 static VALUE
2184 rb_str_bytesize(VALUE str)
2186 return LONG2NUM(RSTRING_LEN(str));
2190 * call-seq:
2191 * empty? -> true or false
2193 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2195 * "hello".empty? # => false
2196 * " ".empty? # => false
2197 * "".empty? # => true
2201 static VALUE
2202 rb_str_empty(VALUE str)
2204 return RBOOL(RSTRING_LEN(str) == 0);
2208 * call-seq:
2209 * string + other_string -> new_string
2211 * Returns a new \String containing +other_string+ concatenated to +self+:
2213 * "Hello from " + self.to_s # => "Hello from main"
2217 VALUE
2218 rb_str_plus(VALUE str1, VALUE str2)
2220 VALUE str3;
2221 rb_encoding *enc;
2222 char *ptr1, *ptr2, *ptr3;
2223 long len1, len2;
2224 int termlen;
2226 StringValue(str2);
2227 enc = rb_enc_check_str(str1, str2);
2228 RSTRING_GETMEM(str1, ptr1, len1);
2229 RSTRING_GETMEM(str2, ptr2, len2);
2230 termlen = rb_enc_mbminlen(enc);
2231 if (len1 > LONG_MAX - len2) {
2232 rb_raise(rb_eArgError, "string size too big");
2234 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2235 ptr3 = RSTRING_PTR(str3);
2236 memcpy(ptr3, ptr1, len1);
2237 memcpy(ptr3+len1, ptr2, len2);
2238 TERM_FILL(&ptr3[len1+len2], termlen);
2240 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2241 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
2242 RB_GC_GUARD(str1);
2243 RB_GC_GUARD(str2);
2244 return str3;
2247 /* A variant of rb_str_plus that does not raise but return Qundef instead. */
2248 MJIT_FUNC_EXPORTED VALUE
2249 rb_str_opt_plus(VALUE str1, VALUE str2)
2251 assert(RBASIC_CLASS(str1) == rb_cString);
2252 assert(RBASIC_CLASS(str2) == rb_cString);
2253 long len1, len2;
2254 MAYBE_UNUSED(char) *ptr1, *ptr2;
2255 RSTRING_GETMEM(str1, ptr1, len1);
2256 RSTRING_GETMEM(str2, ptr2, len2);
2257 int enc1 = rb_enc_get_index(str1);
2258 int enc2 = rb_enc_get_index(str2);
2260 if (enc1 < 0) {
2261 return Qundef;
2263 else if (enc2 < 0) {
2264 return Qundef;
2266 else if (enc1 != enc2) {
2267 return Qundef;
2269 else if (len1 > LONG_MAX - len2) {
2270 return Qundef;
2272 else {
2273 return rb_str_plus(str1, str2);
2279 * call-seq:
2280 * string * integer -> new_string
2282 * Returns a new \String containing +integer+ copies of +self+:
2284 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2285 * "Ho! " * 0 # => ""
/*
 * Implementation of String#*: returns a new String made of +times+
 * copies of +str+.  Raises ArgumentError for a negative count or when
 * the resulting length would overflow.
 */
2289 VALUE
2290 rb_str_times(VALUE str, VALUE times)
2292 VALUE str2;
2293 long n, len;
2294 char *ptr2;
2295 int termlen;
/* Fast path: one copy is just a duplicate. */
2297 if (times == INT2FIX(1)) {
2298 return str_duplicate(rb_cString, str);
/* Fast path: zero copies is an empty string with str's encoding. */
2300 if (times == INT2FIX(0)) {
2301 str2 = str_alloc_embed(rb_cString, 0);
2302 rb_enc_copy(str2, str);
2303 return str2;
2305 len = NUM2LONG(times);
2306 if (len < 0) {
2307 rb_raise(rb_eArgError, "negative argument");
/* Special case: str is a single NUL byte, so the result is len zero
 * bytes and can be produced by zero-filling instead of copying. */
2309 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2310 if (STR_EMBEDDABLE_P(len, 1)) {
2311 str2 = str_alloc_embed(rb_cString, len + 1);
2312 memset(RSTRING_PTR(str2), 0, len + 1);
2314 else {
2315 str2 = str_alloc_heap(rb_cString);
2316 RSTRING(str2)->as.heap.aux.capa = len;
2317 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2318 STR_SET_NOEMBED(str2);
2320 STR_SET_LEN(str2, len);
2321 rb_enc_copy(str2, str);
2322 return str2;
/* Overflow check for len * RSTRING_LEN(str), done before multiplying. */
2324 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2325 rb_raise(rb_eArgError, "argument too big");
/* From here on len is the total byte length of the result. */
2328 len *= RSTRING_LEN(str);
2329 termlen = TERM_LEN(str);
2330 str2 = str_new0(rb_cString, 0, len, termlen);
2331 ptr2 = RSTRING_PTR(str2);
/* Fill by doubling: copy str once, then keep copying the filled prefix
 * onto its own tail while it fits, and finish with the remainder. */
2332 if (len) {
2333 n = RSTRING_LEN(str);
2334 memcpy(ptr2, RSTRING_PTR(str), n);
2335 while (n <= len/2) {
2336 memcpy(ptr2 + n, ptr2, n);
2337 n *= 2;
2339 memcpy(ptr2 + n, ptr2, len-n);
2341 STR_SET_LEN(str2, len);
2342 TERM_FILL(&ptr2[len], termlen);
2343 rb_enc_cr_str_copy_for_substr(str2, str);
2345 return str2;
2349 * call-seq:
2350 * string % object -> new_string
2352 * Returns the result of formatting +object+ into the format specification +self+
2353 * (see Kernel#sprintf for formatting details):
2355 * "%05d" % 123 # => "00123"
2357 * If +self+ contains multiple substitutions, +object+ must be
2358 * an \Array or \Hash containing the values to be substituted:
2360 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2361 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2362 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2366 static VALUE
2367 rb_str_format_m(VALUE str, VALUE arg)
2369 VALUE tmp = rb_check_array_type(arg);
2371 if (!NIL_P(tmp)) {
2372 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2374 return rb_str_format(1, &arg, str);
2377 static inline void
2378 rb_check_lockedtmp(VALUE str)
2380 if (FL_TEST(str, STR_TMPLOCK)) {
2381 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2385 static inline void
2386 str_modifiable(VALUE str)
2388 rb_check_lockedtmp(str);
2389 rb_check_frozen(str);
2392 static inline int
2393 str_dependent_p(VALUE str)
2395 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2396 return 0;
2398 else {
2399 return 1;
2403 static inline int
2404 str_independent(VALUE str)
2406 str_modifiable(str);
2407 return !str_dependent_p(str);
2410 static void
2411 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2413 char *ptr;
2414 char *oldptr;
2415 long capa = len + expand;
2417 if (len > capa) len = capa;
2419 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2420 ptr = RSTRING(str)->as.heap.ptr;
2421 STR_SET_EMBED(str);
2422 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2423 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2424 STR_SET_EMBED_LEN(str, len);
2425 return;
2428 ptr = ALLOC_N(char, (size_t)capa + termlen);
2429 oldptr = RSTRING_PTR(str);
2430 if (oldptr) {
2431 memcpy(ptr, oldptr, len);
2433 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2434 xfree(oldptr);
2436 STR_SET_NOEMBED(str);
2437 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2438 TERM_FILL(ptr + len, termlen);
2439 RSTRING(str)->as.heap.ptr = ptr;
2440 RSTRING(str)->as.heap.len = len;
2441 RSTRING(str)->as.heap.aux.capa = capa;
2444 void
2445 rb_str_modify(VALUE str)
2447 if (!str_independent(str))
2448 str_make_independent(str);
2449 ENC_CODERANGE_CLEAR(str);
2452 void
2453 rb_str_modify_expand(VALUE str, long expand)
2455 int termlen = TERM_LEN(str);
2456 long len = RSTRING_LEN(str);
2458 if (expand < 0) {
2459 rb_raise(rb_eArgError, "negative expanding string size");
2461 if (expand >= LONG_MAX - len) {
2462 rb_raise(rb_eArgError, "string size too big");
2465 if (!str_independent(str)) {
2466 str_make_independent_expand(str, len, expand, termlen);
2468 else if (expand > 0) {
2469 RESIZE_CAPA_TERM(str, len + expand, termlen);
2471 ENC_CODERANGE_CLEAR(str);
2474 /* As rb_str_modify(), but don't clear coderange */
2475 static void
2476 str_modify_keep_cr(VALUE str)
2478 if (!str_independent(str))
2479 str_make_independent(str);
2480 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2481 /* Force re-scan later */
2482 ENC_CODERANGE_CLEAR(str);
2485 static inline void
2486 str_discard(VALUE str)
2488 str_modifiable(str);
2489 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2490 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2491 RSTRING(str)->as.heap.ptr = 0;
2492 RSTRING(str)->as.heap.len = 0;
2496 void
2497 rb_must_asciicompat(VALUE str)
2499 rb_encoding *enc = rb_enc_get(str);
2500 if (!rb_enc_asciicompat(enc)) {
2501 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2505 VALUE
2506 rb_string_value(volatile VALUE *ptr)
2508 VALUE s = *ptr;
2509 if (!RB_TYPE_P(s, T_STRING)) {
2510 s = rb_str_to_str(s);
2511 *ptr = s;
2513 return s;
2516 char *
2517 rb_string_value_ptr(volatile VALUE *ptr)
2519 VALUE str = rb_string_value(ptr);
2520 return RSTRING_PTR(str);
/*
 * Returns 1 when the first +n+ bytes at +s+ are all zero, 0 otherwise.
 * A non-positive +n+ reads nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != 0) return 0;
    }
    return 1;
}
2532 static const char *
2533 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2535 const char *e = s + len;
2537 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2538 if (zero_filled(s, minlen)) return s;
2540 return 0;
2543 static char *
2544 str_fill_term(VALUE str, char *s, long len, int termlen)
2546 /* This function assumes that (capa + termlen) bytes of memory
2547 * is allocated, like many other functions in this file.
2549 if (str_dependent_p(str)) {
2550 if (!zero_filled(s + len, termlen))
2551 str_make_independent_expand(str, len, 0L, termlen);
2553 else {
2554 TERM_FILL(s + len, termlen);
2555 return s;
2557 return RSTRING_PTR(str);
2560 void
2561 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2563 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2564 long len = RSTRING_LEN(str);
2566 assert(capa >= len);
2567 if (capa - len < termlen) {
2568 rb_check_lockedtmp(str);
2569 str_make_independent_expand(str, len, 0L, termlen);
2571 else if (str_dependent_p(str)) {
2572 if (termlen > oldtermlen)
2573 str_make_independent_expand(str, len, 0L, termlen);
2575 else {
2576 if (!STR_EMBED_P(str)) {
2577 /* modify capa instead of realloc */
2578 assert(!FL_TEST((str), STR_SHARED));
2579 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2581 if (termlen > oldtermlen) {
2582 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2586 return;
2589 static char *
2590 str_null_check(VALUE str, int *w)
2592 char *s = RSTRING_PTR(str);
2593 long len = RSTRING_LEN(str);
2594 rb_encoding *enc = rb_enc_get(str);
2595 const int minlen = rb_enc_mbminlen(enc);
2597 if (minlen > 1) {
2598 *w = 1;
2599 if (str_null_char(s, len, minlen, enc)) {
2600 return NULL;
2602 return str_fill_term(str, s, len, minlen);
2604 *w = 0;
2605 if (!s || memchr(s, 0, len)) {
2606 return NULL;
2608 if (s[len]) {
2609 s = str_fill_term(str, s, len, minlen);
2611 return s;
2614 char *
2615 rb_str_to_cstr(VALUE str)
2617 int w;
2618 return str_null_check(str, &w);
2621 char *
2622 rb_string_value_cstr(volatile VALUE *ptr)
2624 VALUE str = rb_string_value(ptr);
2625 int w;
2626 char *s = str_null_check(str, &w);
2627 if (!s) {
2628 if (w) {
2629 rb_raise(rb_eArgError, "string contains null char");
2631 rb_raise(rb_eArgError, "string contains null byte");
2633 return s;
2636 char *
2637 rb_str_fill_terminator(VALUE str, const int newminlen)
2639 char *s = RSTRING_PTR(str);
2640 long len = RSTRING_LEN(str);
2641 return str_fill_term(str, s, len, newminlen);
2644 VALUE
2645 rb_check_string_type(VALUE str)
2647 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2648 return str;
2652 * call-seq:
2653 * String.try_convert(object) -> object, new_string, or nil
2655 * If +object+ is a \String object, returns +object+.
2657 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2658 * calls <tt>object.to_str</tt> and returns the result.
2660 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2662 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2664 static VALUE
2665 rb_str_s_try_convert(VALUE dummy, VALUE str)
2667 return rb_check_string_type(str);
2670 static char*
2671 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2673 long nth = *nthp;
2674 if (rb_enc_mbmaxlen(enc) == 1) {
2675 p += nth;
2677 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2678 p += nth * rb_enc_mbmaxlen(enc);
2680 else if (rb_enc_asciicompat(enc)) {
2681 const char *p2, *e2;
2682 int n;
2684 while (p < e && 0 < nth) {
2685 e2 = p + nth;
2686 if (e < e2) {
2687 *nthp = nth;
2688 return (char *)e;
2690 if (ISASCII(*p)) {
2691 p2 = search_nonascii(p, e2);
2692 if (!p2) {
2693 nth -= e2 - p;
2694 *nthp = nth;
2695 return (char *)e2;
2697 nth -= p2 - p;
2698 p = p2;
2700 n = rb_enc_mbclen(p, e, enc);
2701 p += n;
2702 nth--;
2704 *nthp = nth;
2705 if (nth != 0) {
2706 return (char *)e;
2708 return (char *)p;
2710 else {
2711 while (p < e && nth--) {
2712 p += rb_enc_mbclen(p, e, enc);
2715 if (p > e) p = e;
2716 *nthp = nth;
2717 return (char*)p;
2720 char*
2721 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2723 return str_nth_len(p, e, &nth, enc);
2726 static char*
2727 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2729 if (singlebyte)
2730 p += nth;
2731 else {
2732 p = str_nth_len(p, e, &nth, enc);
2734 if (!p) return 0;
2735 if (p > e) p = e;
2736 return (char *)p;
2739 /* char offset to byte offset */
2740 static long
2741 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2743 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2744 if (!pp) return e - p;
2745 return pp - p;
2748 long
2749 rb_str_offset(VALUE str, long pos)
2751 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2752 STR_ENC_GET(str), single_byte_optimizable(str));
2755 #ifdef NONASCII_MASK
2756 static char *
2757 str_utf8_nth(const char *p, const char *e, long *nthp)
2759 long nth = *nthp;
2760 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2761 const uintptr_t *s, *t;
2762 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2763 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2764 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2765 while (p < (const char *)s) {
2766 if (is_utf8_lead_byte(*p)) nth--;
2767 p++;
2769 do {
2770 nth -= count_utf8_lead_bytes_with_word(s);
2771 s++;
2772 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2773 p = (char *)s;
2775 while (p < e) {
2776 if (is_utf8_lead_byte(*p)) {
2777 if (nth == 0) break;
2778 nth--;
2780 p++;
2782 *nthp = nth;
2783 return (char *)p;
/*
 * Returns the byte distance from +p+ to the position str_utf8_nth()
 * reaches for +nth+ characters.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *pp;

    pp = str_utf8_nth(p, e, &nth);
    return pp - p;
}
2792 #endif
2794 /* byte offset to char offset */
2795 long
2796 rb_str_sublen(VALUE str, long pos)
2798 if (single_byte_optimizable(str) || pos < 0)
2799 return pos;
2800 else {
2801 char *p = RSTRING_PTR(str);
2802 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2806 VALUE
2807 rb_str_subseq(VALUE str, long beg, long len)
2809 VALUE str2;
2811 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2812 SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2813 long olen;
2814 str2 = rb_str_new_shared(rb_str_new_frozen_String(str));
2815 RSTRING(str2)->as.heap.ptr += beg;
2816 olen = RSTRING(str2)->as.heap.len;
2817 if (olen > len) RSTRING(str2)->as.heap.len = len;
2819 else {
2820 str2 = rb_str_new(RSTRING_PTR(str)+beg, len);
2821 RB_GC_GUARD(str);
2824 rb_enc_cr_str_copy_for_substr(str2, str);
2826 return str2;
2829 char *
2830 rb_str_subpos(VALUE str, long beg, long *lenp)
2832 long len = *lenp;
2833 long slen = -1L;
2834 long blen = RSTRING_LEN(str);
2835 rb_encoding *enc = STR_ENC_GET(str);
2836 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2838 if (len < 0) return 0;
2839 if (!blen) {
2840 len = 0;
2842 if (single_byte_optimizable(str)) {
2843 if (beg > blen) return 0;
2844 if (beg < 0) {
2845 beg += blen;
2846 if (beg < 0) return 0;
2848 if (len > blen - beg)
2849 len = blen - beg;
2850 if (len < 0) return 0;
2851 p = s + beg;
2852 goto end;
2854 if (beg < 0) {
2855 if (len > -beg) len = -beg;
2856 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2857 beg = -beg;
2858 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2859 p = e;
2860 if (!p) return 0;
2861 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2862 if (!p) return 0;
2863 len = e - p;
2864 goto end;
2866 else {
2867 slen = str_strlen(str, enc);
2868 beg += slen;
2869 if (beg < 0) return 0;
2870 p = s + beg;
2871 if (len == 0) goto end;
2874 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2875 return 0;
2877 if (len == 0) {
2878 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2879 p = s + beg;
2881 #ifdef NONASCII_MASK
2882 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2883 enc == rb_utf8_encoding()) {
2884 p = str_utf8_nth(s, e, &beg);
2885 if (beg > 0) return 0;
2886 len = str_utf8_offset(p, e, len);
2888 #endif
2889 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2890 int char_sz = rb_enc_mbmaxlen(enc);
2892 p = s + beg * char_sz;
2893 if (p > e) {
2894 return 0;
2896 else if (len * char_sz > e - p)
2897 len = e - p;
2898 else
2899 len *= char_sz;
2901 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2902 if (beg > 0) return 0;
2903 len = 0;
2905 else {
2906 len = str_offset(p, e, len, enc, 0);
2908 end:
2909 *lenp = len;
2910 RB_GC_GUARD(str);
2911 return p;
2914 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2916 VALUE
2917 rb_str_substr(VALUE str, long beg, long len)
2919 return str_substr(str, beg, len, TRUE);
2922 static VALUE
2923 str_substr(VALUE str, long beg, long len, int empty)
2925 VALUE str2;
2926 char *p = rb_str_subpos(str, beg, &len);
2928 if (!p) return Qnil;
2929 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2930 SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2931 long ofs = p - RSTRING_PTR(str);
2932 str2 = rb_str_new_frozen(str);
2933 str2 = str_new_shared(rb_cString, str2);
2934 RSTRING(str2)->as.heap.ptr += ofs;
2935 RSTRING(str2)->as.heap.len = len;
2936 ENC_CODERANGE_CLEAR(str2);
2938 else {
2939 if (!len && !empty) return Qnil;
2940 str2 = rb_str_new(p, len);
2941 RB_GC_GUARD(str);
2943 rb_enc_cr_str_copy_for_substr(str2, str);
2945 return str2;
2948 VALUE
2949 rb_str_freeze(VALUE str)
2951 if (OBJ_FROZEN(str)) return str;
2952 rb_str_resize(str, RSTRING_LEN(str));
2953 return rb_obj_freeze(str);
2958 * call-seq:
2959 * +string -> new_string or self
2961 * Returns +self+ if +self+ is not frozen.
2963 * Otherwise. returns <tt>self.dup</tt>, which is not frozen.
2965 static VALUE
2966 str_uplus(VALUE str)
2968 if (OBJ_FROZEN(str)) {
2969 return rb_str_dup(str);
2971 else {
2972 return str;
2977 * call-seq:
2978 * -string -> frozen_string
2980 * Returns a frozen, possibly pre-existing copy of the string.
2982 * The returned \String will be deduplicated as long as it does not have
2983 * any instance variables set on it.
2985 static VALUE
2986 str_uminus(VALUE str)
2988 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2989 str = rb_str_dup(str);
2991 return rb_fstring(str);
2994 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
2995 #define rb_str_dup_frozen rb_str_new_frozen
2997 VALUE
2998 rb_str_locktmp(VALUE str)
3000 if (FL_TEST(str, STR_TMPLOCK)) {
3001 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3003 FL_SET(str, STR_TMPLOCK);
3004 return str;
3007 VALUE
3008 rb_str_unlocktmp(VALUE str)
3010 if (!FL_TEST(str, STR_TMPLOCK)) {
3011 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3013 FL_UNSET(str, STR_TMPLOCK);
3014 return str;
3017 RUBY_FUNC_EXPORTED VALUE
3018 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3020 rb_str_locktmp(str);
3021 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3024 void
3025 rb_str_set_len(VALUE str, long len)
3027 long capa;
3028 const int termlen = TERM_LEN(str);
3030 str_modifiable(str);
3031 if (STR_SHARED_P(str)) {
3032 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3034 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3035 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3037 STR_SET_LEN(str, len);
3038 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3041 VALUE
3042 rb_str_resize(VALUE str, long len)
3044 long slen;
3045 int independent;
3047 if (len < 0) {
3048 rb_raise(rb_eArgError, "negative string size (or size too big)");
3051 independent = str_independent(str);
3052 ENC_CODERANGE_CLEAR(str);
3053 slen = RSTRING_LEN(str);
3056 long capa;
3057 const int termlen = TERM_LEN(str);
3058 if (STR_EMBED_P(str)) {
3059 if (len == slen) return str;
3060 if (str_embed_capa(str) >= len + termlen) {
3061 STR_SET_EMBED_LEN(str, len);
3062 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3063 return str;
3065 str_make_independent_expand(str, slen, len - slen, termlen);
3067 else if (str_embed_capa(str) >= len + termlen) {
3068 char *ptr = STR_HEAP_PTR(str);
3069 STR_SET_EMBED(str);
3070 if (slen > len) slen = len;
3071 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3072 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3073 STR_SET_EMBED_LEN(str, len);
3074 if (independent) ruby_xfree(ptr);
3075 return str;
3077 else if (!independent) {
3078 if (len == slen) return str;
3079 str_make_independent_expand(str, slen, len - slen, termlen);
3081 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3082 (capa - len) > (len < 1024 ? len : 1024)) {
3083 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3084 (size_t)len + termlen, STR_HEAP_SIZE(str));
3085 RSTRING(str)->as.heap.aux.capa = len;
3087 else if (len == slen) return str;
3088 RSTRING(str)->as.heap.len = len;
3089 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3091 return str;
3094 static VALUE
3095 str_buf_cat(VALUE str, const char *ptr, long len)
3097 long capa, total, olen, off = -1;
3098 char *sptr;
3099 const int termlen = TERM_LEN(str);
3100 #if !USE_RVARGC
3101 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3102 #endif
3104 RSTRING_GETMEM(str, sptr, olen);
3105 if (ptr >= sptr && ptr <= sptr + olen) {
3106 off = ptr - sptr;
3108 rb_str_modify(str);
3109 if (len == 0) return 0;
3110 if (STR_EMBED_P(str)) {
3111 capa = str_embed_capa(str) - termlen;
3112 sptr = RSTRING(str)->as.embed.ary;
3113 olen = RSTRING_EMBED_LEN(str);
3115 else {
3116 capa = RSTRING(str)->as.heap.aux.capa;
3117 sptr = RSTRING(str)->as.heap.ptr;
3118 olen = RSTRING(str)->as.heap.len;
3120 if (olen > LONG_MAX - len) {
3121 rb_raise(rb_eArgError, "string sizes too big");
3123 total = olen + len;
3124 if (capa < total) {
3125 if (total >= LONG_MAX / 2) {
3126 capa = total;
3128 while (total > capa) {
3129 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3131 RESIZE_CAPA_TERM(str, capa, termlen);
3132 sptr = RSTRING_PTR(str);
3134 if (off != -1) {
3135 ptr = sptr + off;
3137 memcpy(sptr + olen, ptr, len);
3138 STR_SET_LEN(str, total);
3139 TERM_FILL(sptr + total, termlen); /* sentinel */
3141 return str;
3144 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
3146 VALUE
3147 rb_str_cat(VALUE str, const char *ptr, long len)
3149 if (len == 0) return str;
3150 if (len < 0) {
3151 rb_raise(rb_eArgError, "negative string size (or size too big)");
3153 return str_buf_cat(str, ptr, len);
3156 VALUE
3157 rb_str_cat_cstr(VALUE str, const char *ptr)
3159 must_not_null(ptr);
3160 return rb_str_buf_cat(str, ptr, strlen(ptr));
3163 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3164 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3165 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3167 static VALUE
3168 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3169 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3171 int str_encindex = ENCODING_GET(str);
3172 int res_encindex;
3173 int str_cr, res_cr;
3174 rb_encoding *str_enc, *ptr_enc;
3176 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3178 if (str_encindex == ptr_encindex) {
3179 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3180 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3183 else {
3184 str_enc = rb_enc_from_index(str_encindex);
3185 ptr_enc = rb_enc_from_index(ptr_encindex);
3186 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3187 if (len == 0)
3188 return str;
3189 if (RSTRING_LEN(str) == 0) {
3190 rb_str_buf_cat(str, ptr, len);
3191 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3192 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3193 return str;
3195 goto incompatible;
3197 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3198 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3200 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3201 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3202 str_cr = rb_enc_str_coderange(str);
3206 if (ptr_cr_ret)
3207 *ptr_cr_ret = ptr_cr;
3209 if (str_encindex != ptr_encindex &&
3210 str_cr != ENC_CODERANGE_7BIT &&
3211 ptr_cr != ENC_CODERANGE_7BIT) {
3212 str_enc = rb_enc_from_index(str_encindex);
3213 ptr_enc = rb_enc_from_index(ptr_encindex);
3214 goto incompatible;
3217 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3218 res_encindex = str_encindex;
3219 res_cr = ENC_CODERANGE_UNKNOWN;
3221 else if (str_cr == ENC_CODERANGE_7BIT) {
3222 if (ptr_cr == ENC_CODERANGE_7BIT) {
3223 res_encindex = str_encindex;
3224 res_cr = ENC_CODERANGE_7BIT;
3226 else {
3227 res_encindex = ptr_encindex;
3228 res_cr = ptr_cr;
3231 else if (str_cr == ENC_CODERANGE_VALID) {
3232 res_encindex = str_encindex;
3233 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3234 res_cr = str_cr;
3235 else
3236 res_cr = ptr_cr;
3238 else { /* str_cr == ENC_CODERANGE_BROKEN */
3239 res_encindex = str_encindex;
3240 res_cr = str_cr;
3241 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3244 if (len < 0) {
3245 rb_raise(rb_eArgError, "negative string size (or size too big)");
3247 str_buf_cat(str, ptr, len);
3248 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3249 return str;
3251 incompatible:
3252 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3253 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3254 UNREACHABLE_RETURN(Qundef);
3257 VALUE
3258 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3260 return rb_enc_cr_str_buf_cat(str, ptr, len,
3261 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3264 VALUE
3265 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3267 /* ptr must reference NUL terminated ASCII string. */
3268 int encindex = ENCODING_GET(str);
3269 rb_encoding *enc = rb_enc_from_index(encindex);
3270 if (rb_enc_asciicompat(enc)) {
3271 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3272 encindex, ENC_CODERANGE_7BIT, 0);
3274 else {
3275 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3276 while (*ptr) {
3277 unsigned int c = (unsigned char)*ptr;
3278 int len = rb_enc_codelen(c, enc);
3279 rb_enc_mbcput(c, buf, enc);
3280 rb_enc_cr_str_buf_cat(str, buf, len,
3281 encindex, ENC_CODERANGE_VALID, 0);
3282 ptr++;
3284 return str;
3288 VALUE
3289 rb_str_buf_append(VALUE str, VALUE str2)
3291 int str2_cr;
3293 str2_cr = ENC_CODERANGE(str2);
3295 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3296 ENCODING_GET(str2), str2_cr, &str2_cr);
3298 ENC_CODERANGE_SET(str2, str2_cr);
3300 return str;
3303 VALUE
3304 rb_str_append(VALUE str, VALUE str2)
3306 StringValue(str2);
3307 return rb_str_buf_append(str, str2);
3310 #define MIN_PRE_ALLOC_SIZE 48
3312 MJIT_FUNC_EXPORTED VALUE
3313 rb_str_concat_literals(size_t num, const VALUE *strary)
3315 VALUE str;
3316 size_t i, s;
3317 long len = 1;
3319 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3320 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3322 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3323 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3324 str = rb_str_resurrect(strary[0]);
3325 s = 1;
3327 else {
3328 str = rb_str_buf_new(len);
3329 rb_enc_copy(str, strary[0]);
3330 s = 0;
3333 for (i = s; i < num; ++i) {
3334 const VALUE v = strary[i];
3335 int encidx = ENCODING_GET(v);
3337 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
3338 encidx, ENC_CODERANGE(v), NULL);
3339 if (encidx != ENCINDEX_US_ASCII) {
3340 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3341 rb_enc_set_index(str, encidx);
3344 return str;
3348 * call-seq:
3349 * concat(*objects) -> string
3351 * Concatenates each object in +objects+ to +self+ and returns +self+:
3353 * s = 'foo'
3354 * s.concat('bar', 'baz') # => "foobarbaz"
3355 * s # => "foobarbaz"
3357 * For each given object +object+ that is an \Integer,
3358 * the value is considered a codepoint and converted to a character before concatenation:
3360 * s = 'foo'
3361 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3363 * Related: String#<<, which takes a single argument.
3365 static VALUE
3366 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3368 str_modifiable(str);
3370 if (argc == 1) {
3371 return rb_str_concat(str, argv[0]);
3373 else if (argc > 1) {
3374 int i;
3375 VALUE arg_str = rb_str_tmp_new(0);
3376 rb_enc_copy(arg_str, str);
3377 for (i = 0; i < argc; i++) {
3378 rb_str_concat(arg_str, argv[i]);
3380 rb_str_buf_append(str, arg_str);
3383 return str;
3387 * call-seq:
3388 * string << object -> string
3390 * Concatenates +object+ to +self+ and returns +self+:
3392 * s = 'foo'
3393 * s << 'bar' # => "foobar"
3394 * s # => "foobar"
3396 * If +object+ is an \Integer,
3397 * the value is considered a codepoint and converted to a character before concatenation:
3399 * s = 'foo'
3400 * s << 33 # => "foo!"
3402 * Related: String#concat, which takes multiple arguments.
3404 VALUE
3405 rb_str_concat(VALUE str1, VALUE str2)
3407 unsigned int code;
3408 rb_encoding *enc = STR_ENC_GET(str1);
3409 int encidx;
3411 if (RB_INTEGER_TYPE_P(str2)) {
3412 if (rb_num_to_uint(str2, &code) == 0) {
3414 else if (FIXNUM_P(str2)) {
3415 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3417 else {
3418 rb_raise(rb_eRangeError, "bignum out of char range");
3421 else {
3422 return rb_str_append(str1, str2);
3425 encidx = rb_enc_to_index(enc);
3426 if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3427 /* US-ASCII automatically extended to ASCII-8BIT */
3428 char buf[1];
3429 buf[0] = (char)code;
3430 if (code > 0xFF) {
3431 rb_raise(rb_eRangeError, "%u out of char range", code);
3433 rb_str_cat(str1, buf, 1);
3434 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3435 rb_enc_associate_index(str1, ENCINDEX_ASCII);
3436 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
3439 else {
3440 long pos = RSTRING_LEN(str1);
3441 int cr = ENC_CODERANGE(str1);
3442 int len;
3443 char *buf;
3445 switch (len = rb_enc_codelen(code, enc)) {
3446 case ONIGERR_INVALID_CODE_POINT_VALUE:
3447 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3448 break;
3449 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3450 case 0:
3451 rb_raise(rb_eRangeError, "%u out of char range", code);
3452 break;
3454 buf = ALLOCA_N(char, len + 1);
3455 rb_enc_mbcput(code, buf, enc);
3456 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3457 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3459 rb_str_resize(str1, pos+len);
3460 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3461 if (cr == ENC_CODERANGE_7BIT && code > 127)
3462 cr = ENC_CODERANGE_VALID;
3463 ENC_CODERANGE_SET(str1, cr);
3465 return str1;
3469 * call-seq:
3470 * prepend(*other_strings) -> string
3472 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3474 * s = 'foo'
3475 * s.prepend('bar', 'baz') # => "barbazfoo"
3476 * s # => "barbazfoo"
3478 * Related: String#concat.
3481 static VALUE
3482 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3484 str_modifiable(str);
3486 if (argc == 1) {
3487 rb_str_update(str, 0L, 0L, argv[0]);
3489 else if (argc > 1) {
3490 int i;
3491 VALUE arg_str = rb_str_tmp_new(0);
3492 rb_enc_copy(arg_str, str);
3493 for (i = 0; i < argc; i++) {
3494 rb_str_append(arg_str, argv[i]);
3496 rb_str_update(str, 0L, 0L, arg_str);
3499 return str;
3502 st_index_t
3503 rb_str_hash(VALUE str)
3505 int e = ENCODING_GET(str);
3506 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
3507 e = 0;
3509 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3513 rb_str_hash_cmp(VALUE str1, VALUE str2)
3515 long len1, len2;
3516 const char *ptr1, *ptr2;
3517 RSTRING_GETMEM(str1, ptr1, len1);
3518 RSTRING_GETMEM(str2, ptr2, len2);
3519 return (len1 != len2 ||
3520 !rb_str_comparable(str1, str2) ||
3521 memcmp(ptr1, ptr2, len1) != 0);
3525 * call-seq:
3526 * hash -> integer
3528 * Returns the integer hash value for +self+.
3529 * The value is based on the length, content and encoding of +self+.
3531 * Related: Object#hash.
3534 static VALUE
3535 rb_str_hash_m(VALUE str)
3537 st_index_t hval = rb_str_hash(str);
3538 return ST2FIX(hval);
3541 #define lesser(a,b) (((a)>(b))?(b):(a))
3544 rb_str_comparable(VALUE str1, VALUE str2)
3546 int idx1, idx2;
3547 int rc1, rc2;
3549 if (RSTRING_LEN(str1) == 0) return TRUE;
3550 if (RSTRING_LEN(str2) == 0) return TRUE;
3551 idx1 = ENCODING_GET(str1);
3552 idx2 = ENCODING_GET(str2);
3553 if (idx1 == idx2) return TRUE;
3554 rc1 = rb_enc_str_coderange(str1);
3555 rc2 = rb_enc_str_coderange(str2);
3556 if (rc1 == ENC_CODERANGE_7BIT) {
3557 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3558 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3559 return TRUE;
3561 if (rc2 == ENC_CODERANGE_7BIT) {
3562 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3563 return TRUE;
3565 return FALSE;
3569 rb_str_cmp(VALUE str1, VALUE str2)
3571 long len1, len2;
3572 const char *ptr1, *ptr2;
3573 int retval;
3575 if (str1 == str2) return 0;
3576 RSTRING_GETMEM(str1, ptr1, len1);
3577 RSTRING_GETMEM(str2, ptr2, len2);
3578 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3579 if (len1 == len2) {
3580 if (!rb_str_comparable(str1, str2)) {
3581 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3582 return 1;
3583 return -1;
3585 return 0;
3587 if (len1 > len2) return 1;
3588 return -1;
3590 if (retval > 0) return 1;
3591 return -1;
3595 * call-seq:
3596 * string == object -> true or false
3597 * string === object -> true or false
3599 * Returns +true+ if +object+ has the same length and content;
3600 * as +self+; +false+ otherwise:
3602 * s = 'foo'
3603 * s == 'foo' # => true
3604 * s == 'food' # => false
3605 * s == 'FOO' # => false
3607 * Returns +false+ if the two strings' encodings are not compatible:
3608 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3610 * If +object+ is not an instance of \String but responds to +to_str+, then the
3611 * two strings are compared using <code>object.==</code>.
3614 VALUE
3615 rb_str_equal(VALUE str1, VALUE str2)
3617 if (str1 == str2) return Qtrue;
3618 if (!RB_TYPE_P(str2, T_STRING)) {
3619 if (!rb_respond_to(str2, idTo_str)) {
3620 return Qfalse;
3622 return rb_equal(str2, str1);
3624 return rb_str_eql_internal(str1, str2);
3628 * call-seq:
3629 * eql?(object) -> true or false
3631 * Returns +true+ if +object+ has the same length and content;
3632 * as +self+; +false+ otherwise:
3634 * s = 'foo'
3635 * s.eql?('foo') # => true
3636 * s.eql?('food') # => false
3637 * s.eql?('FOO') # => false
3639 * Returns +false+ if the two strings' encodings are not compatible:
3641 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3645 MJIT_FUNC_EXPORTED VALUE
3646 rb_str_eql(VALUE str1, VALUE str2)
3648 if (str1 == str2) return Qtrue;
3649 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3650 return rb_str_eql_internal(str1, str2);
3654 * call-seq:
3655 * string <=> other_string -> -1, 0, 1, or nil
3657 * Compares +self+ and +other_string+, returning:
3659 * - -1 if +other_string+ is larger.
3660 * - 0 if the two are equal.
3661 * - 1 if +other_string+ is smaller.
3662 * - +nil+ if the two are incomparable.
3664 * Examples:
3666 * 'foo' <=> 'foo' # => 0
3667 * 'foo' <=> 'food' # => -1
3668 * 'food' <=> 'foo' # => 1
3669 * 'FOO' <=> 'foo' # => -1
3670 * 'foo' <=> 'FOO' # => 1
3671 * 'foo' <=> 1 # => nil
3675 static VALUE
3676 rb_str_cmp_m(VALUE str1, VALUE str2)
3678 int result;
3679 VALUE s = rb_check_string_type(str2);
3680 if (NIL_P(s)) {
3681 return rb_invcmp(str1, str2);
3683 result = rb_str_cmp(str1, s);
3684 return INT2FIX(result);
3687 static VALUE str_casecmp(VALUE str1, VALUE str2);
3688 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3691 * call-seq:
3692 * casecmp(other_string) -> -1, 0, 1, or nil
3694 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3696 * - -1 if <tt>other_string.downcase</tt> is larger.
3697 * - 0 if the two are equal.
3698 * - 1 if <tt>other_string.downcase</tt> is smaller.
3699 * - +nil+ if the two are incomparable.
3701 * Examples:
3703 * 'foo'.casecmp('foo') # => 0
3704 * 'foo'.casecmp('food') # => -1
3705 * 'food'.casecmp('foo') # => 1
3706 * 'FOO'.casecmp('foo') # => 0
3707 * 'foo'.casecmp('FOO') # => 0
3708 * 'foo'.casecmp(1) # => nil
3710 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3712 * Related: String#casecmp?.
3716 static VALUE
3717 rb_str_casecmp(VALUE str1, VALUE str2)
3719 VALUE s = rb_check_string_type(str2);
3720 if (NIL_P(s)) {
3721 return Qnil;
3723 return str_casecmp(str1, s);
3726 static VALUE
3727 str_casecmp(VALUE str1, VALUE str2)
3729 long len;
3730 rb_encoding *enc;
3731 const char *p1, *p1end, *p2, *p2end;
3733 enc = rb_enc_compatible(str1, str2);
3734 if (!enc) {
3735 return Qnil;
3738 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3739 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3740 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3741 while (p1 < p1end && p2 < p2end) {
3742 if (*p1 != *p2) {
3743 unsigned int c1 = TOLOWER(*p1 & 0xff);
3744 unsigned int c2 = TOLOWER(*p2 & 0xff);
3745 if (c1 != c2)
3746 return INT2FIX(c1 < c2 ? -1 : 1);
3748 p1++;
3749 p2++;
3752 else {
3753 while (p1 < p1end && p2 < p2end) {
3754 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3755 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3757 if (0 <= c1 && 0 <= c2) {
3758 c1 = TOLOWER(c1);
3759 c2 = TOLOWER(c2);
3760 if (c1 != c2)
3761 return INT2FIX(c1 < c2 ? -1 : 1);
3763 else {
3764 int r;
3765 l1 = rb_enc_mbclen(p1, p1end, enc);
3766 l2 = rb_enc_mbclen(p2, p2end, enc);
3767 len = l1 < l2 ? l1 : l2;
3768 r = memcmp(p1, p2, len);
3769 if (r != 0)
3770 return INT2FIX(r < 0 ? -1 : 1);
3771 if (l1 != l2)
3772 return INT2FIX(l1 < l2 ? -1 : 1);
3774 p1 += l1;
3775 p2 += l2;
3778 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3779 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3780 return INT2FIX(-1);
3784 * call-seq:
3785 * casecmp?(other_string) -> true, false, or nil
3787 * Returns +true+ if +self+ and +other_string+ are equal after
3788 * Unicode case folding, otherwise +false+:
3790 * 'foo'.casecmp?('foo') # => true
3791 * 'foo'.casecmp?('food') # => false
3792 * 'food'.casecmp?('foo') # => false
3793 * 'FOO'.casecmp?('foo') # => true
3794 * 'foo'.casecmp?('FOO') # => true
3796 * Returns +nil+ if the two values are incomparable:
3798 * 'foo'.casecmp?(1) # => nil
3800 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3802 * Related: String#casecmp.
3806 static VALUE
3807 rb_str_casecmp_p(VALUE str1, VALUE str2)
3809 VALUE s = rb_check_string_type(str2);
3810 if (NIL_P(s)) {
3811 return Qnil;
3813 return str_casecmp_p(str1, s);
3816 static VALUE
3817 str_casecmp_p(VALUE str1, VALUE str2)
3819 rb_encoding *enc;
3820 VALUE folded_str1, folded_str2;
3821 VALUE fold_opt = sym_fold;
3823 enc = rb_enc_compatible(str1, str2);
3824 if (!enc) {
3825 return Qnil;
3828 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3829 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3831 return rb_str_eql(folded_str1, folded_str2);
3834 static long
3835 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3836 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3838 const char *search_start = str_ptr;
3839 long pos, search_len = str_len - offset;
3841 for (;;) {
3842 const char *t;
3843 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3844 if (pos < 0) return pos;
3845 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3846 if (t == search_start + pos) break;
3847 search_len -= t - search_start;
3848 if (search_len <= 0) return -1;
3849 offset += t - search_start;
3850 search_start = t;
3852 return pos + offset;
/* Shorthand for rb_strseq_index() with the in_byte flag fixed to 0. */
#define rb_str_index(str, sub, offset) rb_strseq_index((str), (sub), (offset), 0)
3857 static long
3858 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3860 const char *str_ptr, *str_ptr_end, *sub_ptr;
3861 long str_len, sub_len;
3862 rb_encoding *enc;
3864 enc = rb_enc_check(str, sub);
3865 if (is_broken_string(sub)) return -1;
3867 str_ptr = RSTRING_PTR(str);
3868 str_ptr_end = RSTRING_END(str);
3869 str_len = RSTRING_LEN(str);
3870 sub_ptr = RSTRING_PTR(sub);
3871 sub_len = RSTRING_LEN(sub);
3873 if (str_len < sub_len) return -1;
3875 if (offset != 0) {
3876 long str_len_char, sub_len_char;
3877 int single_byte = single_byte_optimizable(str);
3878 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3879 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3880 if (offset < 0) {
3881 offset += str_len_char;
3882 if (offset < 0) return -1;
3884 if (str_len_char - offset < sub_len_char) return -1;
3885 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3886 str_ptr += offset;
3888 if (sub_len == 0) return offset;
3890 /* need proceed one character at a time */
3891 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3896 * call-seq:
3897 * index(substring, offset = 0) -> integer or nil
3898 * index(regexp, offset = 0) -> integer or nil
3900 * Returns the \Integer index of the first occurrence of the given +substring+,
3901 * or +nil+ if none found:
3903 * 'foo'.index('f') # => 0
3904 * 'foo'.index('o') # => 1
3905 * 'foo'.index('oo') # => 1
3906 * 'foo'.index('ooo') # => nil
3908 * Returns the \Integer index of the first match for the given \Regexp +regexp+,
3909 * or +nil+ if none found:
3911 * 'foo'.index(/f/) # => 0
3912 * 'foo'.index(/o/) # => 1
3913 * 'foo'.index(/oo/) # => 1
3914 * 'foo'.index(/ooo/) # => nil
3916 * \Integer argument +offset+, if given, specifies the position in the
3917 * string to begin the search:
3919 * 'foo'.index('o', 1) # => 1
3920 * 'foo'.index('o', 2) # => 2
3921 * 'foo'.index('o', 3) # => nil
3923 * If +offset+ is negative, counts backward from the end of +self+:
3925 * 'foo'.index('o', -1) # => 2
3926 * 'foo'.index('o', -2) # => 1
3927 * 'foo'.index('o', -3) # => 1
3928 * 'foo'.index('o', -4) # => nil
3930 * Related: String#rindex.
3933 static VALUE
3934 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3936 VALUE sub;
3937 VALUE initpos;
3938 long pos;
3940 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3941 pos = NUM2LONG(initpos);
3943 else {
3944 pos = 0;
3946 if (pos < 0) {
3947 pos += str_strlen(str, NULL);
3948 if (pos < 0) {
3949 if (RB_TYPE_P(sub, T_REGEXP)) {
3950 rb_backref_set(Qnil);
3952 return Qnil;
3956 if (RB_TYPE_P(sub, T_REGEXP)) {
3957 if (pos > str_strlen(str, NULL))
3958 return Qnil;
3959 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3960 rb_enc_check(str, sub), single_byte_optimizable(str));
3962 if (rb_reg_search(sub, str, pos, 0) < 0) {
3963 return Qnil;
3965 else {
3966 VALUE match = rb_backref_get();
3967 struct re_registers *regs = RMATCH_REGS(match);
3968 pos = rb_str_sublen(str, BEG(0));
3969 return LONG2NUM(pos);
3972 else {
3973 StringValue(sub);
3974 pos = rb_str_index(str, sub, pos);
3975 pos = rb_str_sublen(str, pos);
3978 if (pos == -1) return Qnil;
3979 return LONG2NUM(pos);
3982 #ifdef HAVE_MEMRCHR
3983 static long
3984 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3986 char *hit, *adjusted;
3987 int c;
3988 long slen, searchlen;
3989 char *sbeg, *e, *t;
3991 slen = RSTRING_LEN(sub);
3992 if (slen == 0) return pos;
3993 sbeg = RSTRING_PTR(str);
3994 e = RSTRING_END(str);
3995 t = RSTRING_PTR(sub);
3996 c = *t & 0xff;
3997 searchlen = s - sbeg + 1;
3999 do {
4000 hit = memrchr(sbeg, c, searchlen);
4001 if (!hit) break;
4002 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4003 if (hit != adjusted) {
4004 searchlen = adjusted - sbeg;
4005 continue;
4007 if (memcmp(hit, t, slen) == 0)
4008 return rb_str_sublen(str, hit - sbeg);
4009 searchlen = adjusted - sbeg;
4010 } while (searchlen > 0);
4012 return -1;
4014 #else
4015 static long
4016 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
4018 long slen;
4019 char *sbeg, *e, *t;
4021 sbeg = RSTRING_PTR(str);
4022 e = RSTRING_END(str);
4023 t = RSTRING_PTR(sub);
4024 slen = RSTRING_LEN(sub);
4026 while (s) {
4027 if (memcmp(s, t, slen) == 0) {
4028 return pos;
4030 if (pos == 0) break;
4031 pos--;
4032 s = rb_enc_prev_char(sbeg, s, e, enc);
4035 return -1;
4037 #endif
4039 static long
4040 rb_str_rindex(VALUE str, VALUE sub, long pos)
4042 long len, slen;
4043 char *sbeg, *s;
4044 rb_encoding *enc;
4045 int singlebyte;
4047 enc = rb_enc_check(str, sub);
4048 if (is_broken_string(sub)) return -1;
4049 singlebyte = single_byte_optimizable(str);
4050 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4051 slen = str_strlen(sub, enc); /* rb_enc_check */
4053 /* substring longer than string */
4054 if (len < slen) return -1;
4055 if (len - pos < slen) pos = len - slen;
4056 if (len == 0) return pos;
4058 sbeg = RSTRING_PTR(str);
4060 if (pos == 0) {
4061 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4062 return 0;
4063 else
4064 return -1;
4067 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4068 return str_rindex(str, sub, s, pos, enc);
4072 * call-seq:
4073 * rindex(substring, offset = self.length) -> integer or nil
4074 * rindex(regexp, offset = self.length) -> integer or nil
4076 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4077 * or +nil+ if none found:
4079 * 'foo'.rindex('f') # => 0
4080 * 'foo'.rindex('o') # => 2
4081 * 'foo'.rindex('oo') # => 1
4082 * 'foo'.rindex('ooo') # => nil
4084 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4085 * or +nil+ if none found:
4087 * 'foo'.rindex(/f/) # => 0
4088 * 'foo'.rindex(/o/) # => 2
4089 * 'foo'.rindex(/oo/) # => 1
4090 * 'foo'.rindex(/ooo/) # => nil
4092 * The _last_ match means starting at the possible last position, not
4093 * the last of longest matches.
4095 * 'foo'.rindex(/o+/) # => 2
4096 * $~ #=> #<MatchData "o">
4098 * To get the last longest match, combine the pattern with a negative
4099 * lookbehind.
4101 * 'foo'.rindex(/(?<!o)o+/) # => 1
4102 * $~ #=> #<MatchData "oo">
4104 * Or use String#index with a negative lookahead.
4106 * 'foo'.index(/o+(?!.*o)/) # => 1
4107 * $~ #=> #<MatchData "oo">
4109 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4110 * string to _end_ the search:
4112 * 'foo'.rindex('o', 0) # => nil
4113 * 'foo'.rindex('o', 1) # => 1
4114 * 'foo'.rindex('o', 2) # => 2
4115 * 'foo'.rindex('o', 3) # => 2
4117 * If +offset+ is a negative \Integer, the maximum starting position in the
4118 * string to _end_ the search is the sum of the string's length and +offset+:
4120 * 'foo'.rindex('o', -1) # => 2
4121 * 'foo'.rindex('o', -2) # => 1
4122 * 'foo'.rindex('o', -3) # => nil
4123 * 'foo'.rindex('o', -4) # => nil
4125 * Related: String#index.
4128 static VALUE
4129 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4131 VALUE sub;
4132 VALUE vpos;
4133 rb_encoding *enc = STR_ENC_GET(str);
4134 long pos, len = str_strlen(str, enc); /* str's enc */
4136 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4137 pos = NUM2LONG(vpos);
4138 if (pos < 0) {
4139 pos += len;
4140 if (pos < 0) {
4141 if (RB_TYPE_P(sub, T_REGEXP)) {
4142 rb_backref_set(Qnil);
4144 return Qnil;
4147 if (pos > len) pos = len;
4149 else {
4150 pos = len;
4153 if (RB_TYPE_P(sub, T_REGEXP)) {
4154 /* enc = rb_get_check(str, sub); */
4155 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4156 enc, single_byte_optimizable(str));
4158 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4159 VALUE match = rb_backref_get();
4160 struct re_registers *regs = RMATCH_REGS(match);
4161 pos = rb_str_sublen(str, BEG(0));
4162 return LONG2NUM(pos);
4165 else {
4166 StringValue(sub);
4167 pos = rb_str_rindex(str, sub, pos);
4168 if (pos >= 0) return LONG2NUM(pos);
4170 return Qnil;
4174 * call-seq:
4175 * string =~ regexp -> integer or nil
4176 * string =~ object -> integer or nil
4178 * Returns the \Integer index of the first substring that matches
4179 * the given +regexp+, or +nil+ if no match found:
4181 * 'foo' =~ /f/ # => 0
4182 * 'foo' =~ /o/ # => 1
4183 * 'foo' =~ /x/ # => nil
4185 * Note: also updates
4186 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4188 * If the given +object+ is not a \Regexp, returns the value
4189 * returned by <tt>object =~ self</tt>.
4191 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4192 * (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
4194 * number = nil
4195 * "no. 9" =~ /(?<number>\d+)/
4196 * number # => nil (not assigned)
4197 * /(?<number>\d+)/ =~ "no. 9"
4198 * number #=> "9"
4202 static VALUE
4203 rb_str_match(VALUE x, VALUE y)
4205 switch (OBJ_BUILTIN_TYPE(y)) {
4206 case T_STRING:
4207 rb_raise(rb_eTypeError, "type mismatch: String given");
4209 case T_REGEXP:
4210 return rb_reg_match(y, x);
4212 default:
4213 return rb_funcall(y, idEqTilde, 1, x);
4218 static VALUE get_pat(VALUE);
4222 * call-seq:
4223 * match(pattern, offset = 0) -> matchdata or nil
4224 * match(pattern, offset = 0) {|matchdata| ... } -> object
4226 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4228 * Note: also updates
4229 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4231 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4232 * regexp = Regexp.new(pattern)
4233 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4234 * (see Regexp#match):
4235 * matchdata = regexp.match(self)
4237 * With no block given, returns the computed +matchdata+:
4239 * 'foo'.match('f') # => #<MatchData "f">
4240 * 'foo'.match('o') # => #<MatchData "o">
4241 * 'foo'.match('x') # => nil
4243 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4245 * 'foo'.match('f', 1) # => nil
4246 * 'foo'.match('o', 1) # => #<MatchData "o">
4248 * With a block given, calls the block with the computed +matchdata+
4249 * and returns the block's return value:
4251 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4252 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4253 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4257 static VALUE
4258 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4260 VALUE re, result;
4261 if (argc < 1)
4262 rb_check_arity(argc, 1, 2);
4263 re = argv[0];
4264 argv[0] = str;
4265 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4266 if (!NIL_P(result) && rb_block_given_p()) {
4267 return rb_yield(result);
4269 return result;
4273 * call-seq:
4274 * match?(pattern, offset = 0) -> true or false
4276 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4278 * Note: does not update
4279 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4281 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4282 * regexp = Regexp.new(pattern)
4284 * Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4285 * +false+ otherwise:
4287 * 'foo'.match?(/o/) # => true
4288 * 'foo'.match?('o') # => true
4289 * 'foo'.match?(/x/) # => false
4291 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4292 * 'foo'.match?('f', 1) # => false
4293 * 'foo'.match?('o', 1) # => true
4297 static VALUE
4298 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4300 VALUE re;
4301 rb_check_arity(argc, 1, 2);
4302 re = get_pat(argv[0]);
4303 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
/* Outcome of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* the bytes are not (or did not become) a character */
    NEIGHBOR_FOUND    = 1,  /* a neighbor was found and written in place */
    NEIGHBOR_WRAPPED  = 2   /* stepping ran off the end of the range */
};
4312 static enum neighbor_char
4313 enc_succ_char(char *p, long len, rb_encoding *enc)
4315 long i;
4316 int l;
4318 if (rb_enc_mbminlen(enc) > 1) {
4319 /* wchar, trivial case */
4320 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4321 if (!MBCLEN_CHARFOUND_P(r)) {
4322 return NEIGHBOR_NOT_CHAR;
4324 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4325 l = rb_enc_code_to_mbclen(c, enc);
4326 if (!l) return NEIGHBOR_NOT_CHAR;
4327 if (l != len) return NEIGHBOR_WRAPPED;
4328 rb_enc_mbcput(c, p, enc);
4329 r = rb_enc_precise_mbclen(p, p + len, enc);
4330 if (!MBCLEN_CHARFOUND_P(r)) {
4331 return NEIGHBOR_NOT_CHAR;
4333 return NEIGHBOR_FOUND;
4335 while (1) {
4336 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4337 p[i] = '\0';
4338 if (i < 0)
4339 return NEIGHBOR_WRAPPED;
4340 ++((unsigned char*)p)[i];
4341 l = rb_enc_precise_mbclen(p, p+len, enc);
4342 if (MBCLEN_CHARFOUND_P(l)) {
4343 l = MBCLEN_CHARFOUND_LEN(l);
4344 if (l == len) {
4345 return NEIGHBOR_FOUND;
4347 else {
4348 memset(p+l, 0xff, len-l);
4351 if (MBCLEN_INVALID_P(l) && i < len-1) {
4352 long len2;
4353 int l2;
4354 for (len2 = len-1; 0 < len2; len2--) {
4355 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4356 if (!MBCLEN_INVALID_P(l2))
4357 break;
4359 memset(p+len2+1, 0xff, len-(len2+1));
4364 static enum neighbor_char
4365 enc_pred_char(char *p, long len, rb_encoding *enc)
4367 long i;
4368 int l;
4369 if (rb_enc_mbminlen(enc) > 1) {
4370 /* wchar, trivial case */
4371 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4372 if (!MBCLEN_CHARFOUND_P(r)) {
4373 return NEIGHBOR_NOT_CHAR;
4375 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4376 if (!c) return NEIGHBOR_NOT_CHAR;
4377 --c;
4378 l = rb_enc_code_to_mbclen(c, enc);
4379 if (!l) return NEIGHBOR_NOT_CHAR;
4380 if (l != len) return NEIGHBOR_WRAPPED;
4381 rb_enc_mbcput(c, p, enc);
4382 r = rb_enc_precise_mbclen(p, p + len, enc);
4383 if (!MBCLEN_CHARFOUND_P(r)) {
4384 return NEIGHBOR_NOT_CHAR;
4386 return NEIGHBOR_FOUND;
4388 while (1) {
4389 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4390 p[i] = '\xff';
4391 if (i < 0)
4392 return NEIGHBOR_WRAPPED;
4393 --((unsigned char*)p)[i];
4394 l = rb_enc_precise_mbclen(p, p+len, enc);
4395 if (MBCLEN_CHARFOUND_P(l)) {
4396 l = MBCLEN_CHARFOUND_LEN(l);
4397 if (l == len) {
4398 return NEIGHBOR_FOUND;
4400 else {
4401 memset(p+l, 0, len-l);
4404 if (MBCLEN_INVALID_P(l) && i < len-1) {
4405 long len2;
4406 int l2;
4407 for (len2 = len-1; 0 < len2; len2--) {
4408 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4409 if (!MBCLEN_INVALID_P(l2))
4410 break;
4412 memset(p+len2+1, 0, len-(len2+1));
4418 overwrite +p+ by succeeding letter in +enc+ and returns
4419 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4420 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4421 assuming each ranges are successive, and mbclen
4422 never change in each ranges.
4423 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4424 character.
4426 static enum neighbor_char
4427 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4429 enum neighbor_char ret;
4430 unsigned int c;
4431 int ctype;
4432 int range;
4433 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4435 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4436 int try;
4437 const int max_gaps = 1;
4439 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4440 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4441 ctype = ONIGENC_CTYPE_DIGIT;
4442 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4443 ctype = ONIGENC_CTYPE_ALPHA;
4444 else
4445 return NEIGHBOR_NOT_CHAR;
4447 MEMCPY(save, p, char, len);
4448 for (try = 0; try <= max_gaps; ++try) {
4449 ret = enc_succ_char(p, len, enc);
4450 if (ret == NEIGHBOR_FOUND) {
4451 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4452 if (rb_enc_isctype(c, ctype, enc))
4453 return NEIGHBOR_FOUND;
4456 MEMCPY(p, save, char, len);
4457 range = 1;
4458 while (1) {
4459 MEMCPY(save, p, char, len);
4460 ret = enc_pred_char(p, len, enc);
4461 if (ret == NEIGHBOR_FOUND) {
4462 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4463 if (!rb_enc_isctype(c, ctype, enc)) {
4464 MEMCPY(p, save, char, len);
4465 break;
4468 else {
4469 MEMCPY(p, save, char, len);
4470 break;
4472 range++;
4474 if (range == 1) {
4475 return NEIGHBOR_NOT_CHAR;
4478 if (ctype != ONIGENC_CTYPE_DIGIT) {
4479 MEMCPY(carry, p, char, len);
4480 return NEIGHBOR_WRAPPED;
4483 MEMCPY(carry, p, char, len);
4484 enc_succ_char(carry, len, enc);
4485 return NEIGHBOR_WRAPPED;
4489 static VALUE str_succ(VALUE str);
4492 * call-seq:
4493 * succ -> new_str
4495 * Returns the successor to +self+. The successor is calculated by
4496 * incrementing characters.
4498 * The first character to be incremented is the rightmost alphanumeric,
4499 * or, if no alphanumerics, the rightmost character:
4501 * 'THX1138'.succ # => "THX1139"
4502 * '<<koala>>'.succ # => "<<koalb>>"
4503 * '***'.succ # => '**+'
4505 * The successor to a digit is another digit, "carrying" to the next-left
4506 * character for a "rollover" from 9 to 0, and prepending another digit
4507 * if necessary:
4509 * '00'.succ # => "01"
4510 * '09'.succ # => "10"
4511 * '99'.succ # => "100"
4513 * The successor to a letter is another letter of the same case,
4514 * carrying to the next-left character for a rollover,
4515 * and prepending another same-case letter if necessary:
4517 * 'aa'.succ # => "ab"
4518 * 'az'.succ # => "ba"
4519 * 'zz'.succ # => "aaa"
4520 * 'AA'.succ # => "AB"
4521 * 'AZ'.succ # => "BA"
4522 * 'ZZ'.succ # => "AAA"
4524 * The successor to a non-alphanumeric character is the next character
4525 * in the underlying character set's collating sequence,
4526 * carrying to the next-left character for a rollover,
4527 * and prepending another character if necessary:
4529 * s = 0.chr * 3
4530 * s # => "\x00\x00\x00"
4531 * s.succ # => "\x00\x00\x01"
4532 * s = 255.chr * 3
4533 * s # => "\xFF\xFF\xFF"
4534 * s.succ # => "\x01\x00\x00\x00"
4536 * Carrying can occur between and among mixtures of alphanumeric characters:
4538 * s = 'zz99zz99'
4539 * s.succ # => "aaa00aa00"
4540 * s = '99zz99zz'
4541 * s.succ # => "100aa00aa"
4543 * The successor to an empty \String is a new empty \String:
4545 * ''.succ # => ""
4547 * String#next is an alias for String#succ.
4550 VALUE
4551 rb_str_succ(VALUE orig)
4553 VALUE str;
4554 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4555 rb_enc_cr_str_copy_for_substr(str, orig);
4556 return str_succ(str);
4559 static VALUE
4560 str_succ(VALUE str)
4562 rb_encoding *enc;
4563 char *sbeg, *s, *e, *last_alnum = 0;
4564 int found_alnum = 0;
4565 long l, slen;
4566 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4567 long carry_pos = 0, carry_len = 1;
4568 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4570 slen = RSTRING_LEN(str);
4571 if (slen == 0) return str;
4573 enc = STR_ENC_GET(str);
4574 sbeg = RSTRING_PTR(str);
4575 s = e = sbeg + slen;
4577 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4578 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4579 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4580 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4581 break;
4584 l = rb_enc_precise_mbclen(s, e, enc);
4585 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4586 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4587 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4588 switch (neighbor) {
4589 case NEIGHBOR_NOT_CHAR:
4590 continue;
4591 case NEIGHBOR_FOUND:
4592 return str;
4593 case NEIGHBOR_WRAPPED:
4594 last_alnum = s;
4595 break;
4597 found_alnum = 1;
4598 carry_pos = s - sbeg;
4599 carry_len = l;
4601 if (!found_alnum) { /* str contains no alnum */
4602 s = e;
4603 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4604 enum neighbor_char neighbor;
4605 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4606 l = rb_enc_precise_mbclen(s, e, enc);
4607 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4608 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4609 MEMCPY(tmp, s, char, l);
4610 neighbor = enc_succ_char(tmp, l, enc);
4611 switch (neighbor) {
4612 case NEIGHBOR_FOUND:
4613 MEMCPY(s, tmp, char, l);
4614 return str;
4615 break;
4616 case NEIGHBOR_WRAPPED:
4617 MEMCPY(s, tmp, char, l);
4618 break;
4619 case NEIGHBOR_NOT_CHAR:
4620 break;
4622 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4623 /* wrapped to \0...\0. search next valid char. */
4624 enc_succ_char(s, l, enc);
4626 if (!rb_enc_asciicompat(enc)) {
4627 MEMCPY(carry, s, char, l);
4628 carry_len = l;
4630 carry_pos = s - sbeg;
4632 ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
4634 RESIZE_CAPA(str, slen + carry_len);
4635 sbeg = RSTRING_PTR(str);
4636 s = sbeg + carry_pos;
4637 memmove(s + carry_len, s, slen - carry_pos);
4638 memmove(s, carry, carry_len);
4639 slen += carry_len;
4640 STR_SET_LEN(str, slen);
4641 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4642 rb_enc_str_coderange(str);
4643 return str;
4648 * call-seq:
4649 * succ! -> self
4651 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4653 * String#next! is an alias for String#succ!.
4656 static VALUE
4657 rb_str_succ_bang(VALUE str)
4659 rb_str_modify(str);
4660 str_succ(str);
4661 return str;
/*
 * Returns 1 if each of the first +len+ bytes at +s+ satisfies ISDIGIT(),
 * 0 otherwise.  A non-positive +len+ yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4674 static int
4675 str_upto_i(VALUE str, VALUE arg)
4677 rb_yield(str);
4678 return 0;
4682 * call-seq:
4683 * upto(other_string, exclusive = false) {|string| ... } -> self
4684 * upto(other_string, exclusive = false) -> new_enumerator
4686 * With a block given, calls the block with each \String value
4687 * returned by successive calls to String#succ;
4688 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4689 * the sequence terminates when value +other_string+ is reached;
4690 * returns +self+:
4692 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4693 * Output:
4695 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4697 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4699 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4701 * Output:
4703 * a8 a9 b0 b1 b2 b3 b4 b5
4705 * If +other_string+ would not be reached, does not call the block:
4707 * '25'.upto('5') {|s| fail s }
4708 * 'aa'.upto('a') {|s| fail s }
4710 * With no block given, returns a new \Enumerator:
4712 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4716 static VALUE
4717 rb_str_upto(int argc, VALUE *argv, VALUE beg)
4719 VALUE end, exclusive;
4721 rb_scan_args(argc, argv, "11", &end, &exclusive);
4722 RETURN_ENUMERATOR(beg, argc, argv);
4723 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4726 VALUE
4727 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4729 VALUE current, after_end;
4730 ID succ;
4731 int n, ascii;
4732 rb_encoding *enc;
4734 CONST_ID(succ, "succ");
4735 StringValue(end);
4736 enc = rb_enc_check(beg, end);
4737 ascii = (is_ascii_string(beg) && is_ascii_string(end));
4738 /* single character */
4739 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4740 char c = RSTRING_PTR(beg)[0];
4741 char e = RSTRING_PTR(end)[0];
4743 if (c > e || (excl && c == e)) return beg;
4744 for (;;) {
4745 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4746 if (!excl && c == e) break;
4747 c++;
4748 if (excl && c == e) break;
4750 return beg;
4752 /* both edges are all digits */
4753 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4754 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4755 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4756 VALUE b, e;
4757 int width;
4759 width = RSTRING_LENINT(beg);
4760 b = rb_str_to_inum(beg, 10, FALSE);
4761 e = rb_str_to_inum(end, 10, FALSE);
4762 if (FIXNUM_P(b) && FIXNUM_P(e)) {
4763 long bi = FIX2LONG(b);
4764 long ei = FIX2LONG(e);
4765 rb_encoding *usascii = rb_usascii_encoding();
4767 while (bi <= ei) {
4768 if (excl && bi == ei) break;
4769 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4770 bi++;
4773 else {
4774 ID op = excl ? '<' : idLE;
4775 VALUE args[2], fmt = rb_fstring_lit("%.*d");
4777 args[0] = INT2FIX(width);
4778 while (rb_funcall(b, op, 1, e)) {
4779 args[1] = b;
4780 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4781 b = rb_funcallv(b, succ, 0, 0);
4784 return beg;
4786 /* normal case */
4787 n = rb_str_cmp(beg, end);
4788 if (n > 0 || (excl && n == 0)) return beg;
4790 after_end = rb_funcallv(end, succ, 0, 0);
4791 current = str_duplicate(rb_cString, beg);
4792 while (!rb_str_equal(current, after_end)) {
4793 VALUE next = Qnil;
4794 if (excl || !rb_str_equal(current, end))
4795 next = rb_funcallv(current, succ, 0, 0);
4796 if ((*each)(current, arg)) break;
4797 if (NIL_P(next)) break;
4798 current = next;
4799 StringValue(current);
4800 if (excl && rb_str_equal(current, end)) break;
4801 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4802 break;
4805 return beg;
4808 VALUE
4809 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
4811 VALUE current;
4812 ID succ;
4814 CONST_ID(succ, "succ");
4815 /* both edges are all digits */
4816 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4817 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4818 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4819 int width = RSTRING_LENINT(beg);
4820 b = rb_str_to_inum(beg, 10, FALSE);
4821 if (FIXNUM_P(b)) {
4822 long bi = FIX2LONG(b);
4823 rb_encoding *usascii = rb_usascii_encoding();
4825 while (FIXABLE(bi)) {
4826 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4827 bi++;
4829 b = LONG2NUM(bi);
4831 args[0] = INT2FIX(width);
4832 while (1) {
4833 args[1] = b;
4834 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4835 b = rb_funcallv(b, succ, 0, 0);
4838 /* normal case */
4839 current = str_duplicate(rb_cString, beg);
4840 while (1) {
4841 VALUE next = rb_funcallv(current, succ, 0, 0);
4842 if ((*each)(current, arg)) break;
4843 current = next;
4844 StringValue(current);
4845 if (RSTRING_LEN(current) == 0)
4846 break;
4849 return beg;
4852 static int
4853 include_range_i(VALUE str, VALUE arg)
4855 VALUE *argp = (VALUE *)arg;
4856 if (!rb_equal(str, *argp)) return 0;
4857 *argp = Qnil;
4858 return 1;
4861 VALUE
4862 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4864 beg = rb_str_new_frozen(beg);
4865 StringValue(end);
4866 end = rb_str_new_frozen(end);
4867 if (NIL_P(val)) return Qfalse;
4868 val = rb_check_string_type(val);
4869 if (NIL_P(val)) return Qfalse;
4870 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4871 rb_enc_asciicompat(STR_ENC_GET(end)) &&
4872 rb_enc_asciicompat(STR_ENC_GET(val))) {
4873 const char *bp = RSTRING_PTR(beg);
4874 const char *ep = RSTRING_PTR(end);
4875 const char *vp = RSTRING_PTR(val);
4876 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4877 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4878 return Qfalse;
4879 else {
4880 char b = *bp;
4881 char e = *ep;
4882 char v = *vp;
4884 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4885 if (b <= v && v < e) return Qtrue;
4886 return RBOOL(!RTEST(exclusive) && v == e);
4890 #if 0
4891 /* both edges are all digits */
4892 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4893 all_digits_p(bp, RSTRING_LEN(beg)) &&
4894 all_digits_p(ep, RSTRING_LEN(end))) {
4895 /* TODO */
4897 #endif
4899 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4901 return RBOOL(NIL_P(val));
4904 static VALUE
4905 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4907 if (rb_reg_search(re, str, 0, 0) >= 0) {
4908 VALUE match = rb_backref_get();
4909 int nth = rb_reg_backref_number(match, backref);
4910 return rb_reg_nth_match(nth, match);
4912 return Qnil;
4915 static VALUE
4916 rb_str_aref(VALUE str, VALUE indx)
4918 long idx;
4920 if (FIXNUM_P(indx)) {
4921 idx = FIX2LONG(indx);
4923 else if (RB_TYPE_P(indx, T_REGEXP)) {
4924 return rb_str_subpat(str, indx, INT2FIX(0));
4926 else if (RB_TYPE_P(indx, T_STRING)) {
4927 if (rb_str_index(str, indx, 0) != -1)
4928 return str_duplicate(rb_cString, indx);
4929 return Qnil;
4931 else {
4932 /* check if indx is Range */
4933 long beg, len = str_strlen(str, NULL);
4934 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4935 case Qfalse:
4936 break;
4937 case Qnil:
4938 return Qnil;
4939 default:
4940 return rb_str_substr(str, beg, len);
4942 idx = NUM2LONG(indx);
4945 return str_substr(str, idx, 1, FALSE);
4950 * call-seq:
4951 * string[index] -> new_string or nil
4952 * string[start, length] -> new_string or nil
4953 * string[range] -> new_string or nil
4954 * string[regexp, capture = 0] -> new_string or nil
4955 * string[substring] -> new_string or nil
4957 * Returns the substring of +self+ specified by the arguments.
4959 * When the single \Integer argument +index+ is given,
4960 * returns the 1-character substring found in +self+ at offset +index+:
4962 * 'bar'[2] # => "r"
4964 * Counts backward from the end of +self+ if +index+ is negative:
4966 * 'foo'[-3] # => "f"
4968 * Returns +nil+ if +index+ is out of range:
4970 * 'foo'[3] # => nil
4971 * 'foo'[-4] # => nil
4973 * When the two \Integer arguments +start+ and +length+ are given,
4974 * returns the substring of the given +length+ found in +self+ at offset +start+:
4976 * 'foo'[0, 2] # => "fo"
4977 * 'foo'[0, 0] # => ""
4979 * Counts backward from the end of +self+ if +start+ is negative:
4981 * 'foo'[-2, 2] # => "oo"
4983 * Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4985 * 'foo'[3, 2] # => ""
4987 * Returns +nil+ if +start+ is out of range:
4989 * 'foo'[4, 2] # => nil
4990 * 'foo'[-4, 2] # => nil
4992 * Returns the trailing substring of +self+ if +length+ is large:
4994 * 'foo'[1, 50] # => "oo"
4996 * Returns +nil+ if +length+ is negative:
4998 * 'foo'[0, -1] # => nil
5000 * When the single \Range argument +range+ is given,
5001 * derives +start+ and +length+ values from the given +range+,
5002 * and returns values as above:
5004 * - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
5005 * - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
5007 * When the \Regexp argument +regexp+ is given,
5008 * and the +capture+ argument is <tt>0</tt>,
5009 * returns the first matching substring found in +self+,
5010 * or +nil+ if none found:
5012 * 'foo'[/o/] # => "o"
5013 * 'foo'[/x/] # => nil
5014 * s = 'hello there'
5015 * s[/[aeiou](.)\1/] # => "ell"
5016 * s[/[aeiou](.)\1/, 0] # => "ell"
5018 * If argument +capture+ is given and not <tt>0</tt>,
5019 * it should be either an \Integer capture group index or a \String or \Symbol capture group name;
5020 * the method call returns only the specified capture
5021 * (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
5023 * s = 'hello there'
5024 * s[/[aeiou](.)\1/, 1] # => "l"
5025 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
5026 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
5028 * If an invalid capture group index is given, +nil+ is returned. If an invalid
5029 * capture group name is given, +IndexError+ is raised.
5031 * When the single \String argument +substring+ is given,
5032 * returns the substring from +self+ if found, otherwise +nil+:
5034 * 'foo'['oo'] # => "oo"
5035 * 'foo'['xx'] # => nil
5037 * String#slice is an alias for String#[].
5040 static VALUE
5041 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5043 if (argc == 2) {
5044 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5045 return rb_str_subpat(str, argv[0], argv[1]);
5047 else {
5048 long beg = NUM2LONG(argv[0]);
5049 long len = NUM2LONG(argv[1]);
5050 return rb_str_substr(str, beg, len);
5053 rb_check_arity(argc, 1, 2);
5054 return rb_str_aref(str, argv[0]);
5057 VALUE
5058 rb_str_drop_bytes(VALUE str, long len)
5060 char *ptr = RSTRING_PTR(str);
5061 long olen = RSTRING_LEN(str), nlen;
5063 str_modifiable(str);
5064 if (len > olen) len = olen;
5065 nlen = olen - len;
5066 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5067 char *oldptr = ptr;
5068 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5069 STR_SET_EMBED(str);
5070 STR_SET_EMBED_LEN(str, nlen);
5071 ptr = RSTRING(str)->as.embed.ary;
5072 memmove(ptr, oldptr + len, nlen);
5073 if (fl == STR_NOEMBED) xfree(oldptr);
5075 else {
5076 if (!STR_SHARED_P(str)) {
5077 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5078 rb_enc_cr_str_exact_copy(shared, str);
5079 OBJ_FREEZE(shared);
5081 ptr = RSTRING(str)->as.heap.ptr += len;
5082 RSTRING(str)->as.heap.len = nlen;
5084 ptr[nlen] = 0;
5085 ENC_CODERANGE_CLEAR(str);
5086 return str;
5089 static void
5090 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5092 char *sptr;
5093 long slen, vlen = RSTRING_LEN(val);
5094 int cr;
5096 if (beg == 0 && vlen == 0) {
5097 rb_str_drop_bytes(str, len);
5098 return;
5101 str_modify_keep_cr(str);
5102 RSTRING_GETMEM(str, sptr, slen);
5103 if (len < vlen) {
5104 /* expand string */
5105 RESIZE_CAPA(str, slen + vlen - len);
5106 sptr = RSTRING_PTR(str);
5109 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
5110 cr = rb_enc_str_coderange(val);
5111 else
5112 cr = ENC_CODERANGE_UNKNOWN;
5114 if (vlen != len) {
5115 memmove(sptr + beg + vlen,
5116 sptr + beg + len,
5117 slen - (beg + len));
5119 if (vlen < beg && len < 0) {
5120 MEMZERO(sptr + slen, char, -len);
5122 if (vlen > 0) {
5123 memmove(sptr + beg, RSTRING_PTR(val), vlen);
5125 slen += vlen - len;
5126 STR_SET_LEN(str, slen);
5127 TERM_FILL(&sptr[slen], TERM_LEN(str));
5128 ENC_CODERANGE_SET(str, cr);
5131 void
5132 rb_str_update(VALUE str, long beg, long len, VALUE val)
5134 long slen;
5135 char *p, *e;
5136 rb_encoding *enc;
5137 int singlebyte = single_byte_optimizable(str);
5138 int cr;
5140 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5142 StringValue(val);
5143 enc = rb_enc_check(str, val);
5144 slen = str_strlen(str, enc); /* rb_enc_check */
5146 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5147 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5149 if (beg < 0) {
5150 beg += slen;
5152 assert(beg >= 0);
5153 assert(beg <= slen);
5154 if (len > slen - beg) {
5155 len = slen - beg;
5157 str_modify_keep_cr(str);
5158 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5159 if (!p) p = RSTRING_END(str);
5160 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5161 if (!e) e = RSTRING_END(str);
5162 /* error check */
5163 beg = p - RSTRING_PTR(str); /* physical position */
5164 len = e - p; /* physical length */
5165 rb_str_splice_0(str, beg, len, val);
5166 rb_enc_associate(str, enc);
5167 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
5168 if (cr != ENC_CODERANGE_BROKEN)
5169 ENC_CODERANGE_SET(str, cr);
/* File-local alias: rb_str_splice() expands to rb_str_update() with the same arguments. */
5172 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5174 static void
5175 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5177 int nth;
5178 VALUE match;
5179 long start, end, len;
5180 rb_encoding *enc;
5181 struct re_registers *regs;
5183 if (rb_reg_search(re, str, 0, 0) < 0) {
5184 rb_raise(rb_eIndexError, "regexp not matched");
5186 match = rb_backref_get();
5187 nth = rb_reg_backref_number(match, backref);
5188 regs = RMATCH_REGS(match);
5189 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5190 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5192 if (nth < 0) {
5193 nth += regs->num_regs;
5196 start = BEG(nth);
5197 if (start == -1) {
5198 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5200 end = END(nth);
5201 len = end - start;
5202 StringValue(val);
5203 enc = rb_enc_check_str(str, val);
5204 rb_str_splice_0(str, start, len, val);
5205 rb_enc_associate(str, enc);
5208 static VALUE
5209 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5211 long idx, beg;
5213 switch (TYPE(indx)) {
5214 case T_REGEXP:
5215 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5216 return val;
5218 case T_STRING:
5219 beg = rb_str_index(str, indx, 0);
5220 if (beg < 0) {
5221 rb_raise(rb_eIndexError, "string not matched");
5223 beg = rb_str_sublen(str, beg);
5224 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5225 return val;
5227 default:
5228 /* check if indx is Range */
5230 long beg, len;
5231 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5232 rb_str_splice(str, beg, len, val);
5233 return val;
5236 /* FALLTHROUGH */
5238 case T_FIXNUM:
5239 idx = NUM2LONG(indx);
5240 rb_str_splice(str, idx, 1, val);
5241 return val;
5246 * call-seq:
5247 * str[integer] = new_str
5248 * str[integer, integer] = new_str
5249 * str[range] = new_str
5250 * str[regexp] = new_str
5251 * str[regexp, integer] = new_str
5252 * str[regexp, name] = new_str
5253 * str[other_str] = new_str
5255 * Element Assignment---Replaces some or all of the content of
5256 * <i>str</i>. The portion of the string affected is determined using
5257 * the same criteria as String#[]. If the replacement string is not
5258 * the same length as the text it is replacing, the string will be
5259 * adjusted accordingly. If the regular expression or string used
5260 * as the index doesn't match a position in the string, IndexError is
5261 * raised. If the regular expression form is used, the optional
5262 * second Integer allows you to specify which portion of the match to
5263 * replace (effectively using the MatchData indexing rules). The forms
5264 * that take an Integer will raise an IndexError if the value is out
5265 * of range; the Range form will raise a RangeError, and the Regexp
5266 * and String will raise an IndexError on negative match.
5269 static VALUE
5270 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5272 if (argc == 3) {
5273 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5274 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5276 else {
5277 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5279 return argv[2];
5281 rb_check_arity(argc, 2, 3);
5282 return rb_str_aset(str, argv[0], argv[1]);
5286 * call-seq:
5287 * insert(index, other_string) -> self
5289 * Inserts the given +other_string+ into +self+; returns +self+.
5291 * If the \Integer +index+ is non-negative, inserts +other_string+ at offset +index+:
5293 * 'foo'.insert(1, 'bar') # => "fbaroo"
5295 * If the \Integer +index+ is negative, counts backward from the end of +self+
5296 * and inserts +other_string+ at offset <tt>index+1</tt>
5297 * (that is, _after_ <tt>self[index]</tt>):
5299 * 'foo'.insert(-2, 'bar') # => "fobaro"
5303 static VALUE
5304 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5306 long pos = NUM2LONG(idx);
5308 if (pos == -1) {
5309 return rb_str_append(str, str2);
5311 else if (pos < 0) {
5312 pos++;
5314 rb_str_splice(str, pos, 0, str2);
5315 return str;
5320 * call-seq:
5321 * slice!(index) -> new_string or nil
5322 * slice!(start, length) -> new_string or nil
5323 * slice!(range) -> new_string or nil
5324 * slice!(regexp, capture = 0) -> new_string or nil
5325 * slice!(substring) -> new_string or nil
5327 * Removes the substring of +self+ specified by the arguments;
5328 * returns the removed substring.
5330 * See String#[] for details about the arguments that specify the substring.
5332 * A few examples:
5334 * string = "This is a string"
5335 * string.slice!(2) #=> "i"
5336 * string.slice!(3..6) #=> " is "
5337 * string.slice!(/s.*t/) #=> "sa st"
5338 * string.slice!("r") #=> "r"
5339 * string #=> "Thing"
5343 static VALUE
5344 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5346 VALUE result = Qnil;
5347 VALUE indx;
5348 long beg, len = 1;
5349 char *p;
5351 rb_check_arity(argc, 1, 2);
5352 str_modify_keep_cr(str);
5353 indx = argv[0];
5354 if (RB_TYPE_P(indx, T_REGEXP)) {
5355 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5356 VALUE match = rb_backref_get();
5357 struct re_registers *regs = RMATCH_REGS(match);
5358 int nth = 0;
5359 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5360 if ((nth += regs->num_regs) <= 0) return Qnil;
5362 else if (nth >= regs->num_regs) return Qnil;
5363 beg = BEG(nth);
5364 len = END(nth) - beg;
5365 goto subseq;
5367 else if (argc == 2) {
5368 beg = NUM2LONG(indx);
5369 len = NUM2LONG(argv[1]);
5370 goto num_index;
5372 else if (FIXNUM_P(indx)) {
5373 beg = FIX2LONG(indx);
5374 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5375 if (!len) return Qnil;
5376 beg = p - RSTRING_PTR(str);
5377 goto subseq;
5379 else if (RB_TYPE_P(indx, T_STRING)) {
5380 beg = rb_str_index(str, indx, 0);
5381 if (beg == -1) return Qnil;
5382 len = RSTRING_LEN(indx);
5383 result = str_duplicate(rb_cString, indx);
5384 goto squash;
5386 else {
5387 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5388 case Qnil:
5389 return Qnil;
5390 case Qfalse:
5391 beg = NUM2LONG(indx);
5392 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5393 if (!len) return Qnil;
5394 beg = p - RSTRING_PTR(str);
5395 goto subseq;
5396 default:
5397 goto num_index;
5401 num_index:
5402 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5403 beg = p - RSTRING_PTR(str);
5405 subseq:
5406 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5407 rb_enc_cr_str_copy_for_substr(result, str);
5409 squash:
5410 if (len > 0) {
5411 if (beg == 0) {
5412 rb_str_drop_bytes(str, len);
5414 else {
5415 char *sptr = RSTRING_PTR(str);
5416 long slen = RSTRING_LEN(str);
5417 if (beg + len > slen) /* pathological check */
5418 len = slen - beg;
5419 memmove(sptr + beg,
5420 sptr + beg + len,
5421 slen - (beg + len));
5422 slen -= len;
5423 STR_SET_LEN(str, slen);
5424 TERM_FILL(&sptr[slen], TERM_LEN(str));
5427 return result;
5430 static VALUE
5431 get_pat(VALUE pat)
5433 VALUE val;
5435 switch (OBJ_BUILTIN_TYPE(pat)) {
5436 case T_REGEXP:
5437 return pat;
5439 case T_STRING:
5440 break;
5442 default:
5443 val = rb_check_string_type(pat);
5444 if (NIL_P(val)) {
5445 Check_Type(pat, T_REGEXP);
5447 pat = val;
5450 return rb_reg_regcomp(pat);
5453 static VALUE
5454 get_pat_quoted(VALUE pat, int check)
5456 VALUE val;
5458 switch (OBJ_BUILTIN_TYPE(pat)) {
5459 case T_REGEXP:
5460 return pat;
5462 case T_STRING:
5463 break;
5465 default:
5466 val = rb_check_string_type(pat);
5467 if (NIL_P(val)) {
5468 Check_Type(pat, T_REGEXP);
5470 pat = val;
5472 if (check && is_broken_string(pat)) {
5473 rb_exc_raise(rb_reg_check_preprocess(pat));
5475 return pat;
5478 static long
5479 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5481 if (BUILTIN_TYPE(pat) == T_STRING) {
5482 pos = rb_strseq_index(str, pat, pos, 1);
5483 if (set_backref_str) {
5484 if (pos >= 0) {
5485 str = rb_str_new_frozen_String(str);
5486 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5488 else {
5489 rb_backref_set(Qnil);
5492 return pos;
5494 else {
5495 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5501 * call-seq:
5502 * sub!(pattern, replacement) -> self or nil
5503 * sub!(pattern) {|match| ... } -> self or nil
5505 * Returns +self+ with only the first occurrence
5506 * (not all occurrences) of the given +pattern+ replaced.
5508 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5510 * Related: String#sub, String#gsub, String#gsub!.
/*
 * Implements String#sub!: replaces the first match of the pattern in
 * +str+ in place.  Returns +str+ when a substitution was made and Qnil
 * when the pattern did not match.
 *
 * argv[0] is the pattern.  argv[1], when given, is either a Hash (looked
 * up with the matched text) or a replacement String.  With only one
 * argument the block is yielded the match and its result is used.
 */
5514 static VALUE
5515 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5517 VALUE pat, repl, hash = Qnil;
5518 int iter = 0;
5519 long plen;
/* a block makes the replacement argument optional */
5520 int min_arity = rb_block_given_p() ? 1 : 2;
5521 long beg;
5523 rb_check_arity(argc, min_arity, 2);
5524 if (argc == 1) {
5525 iter = 1;
5527 else {
5528 repl = argv[1];
/* a Hash replacement leaves `hash` non-nil; otherwise repl must be a String */
5529 hash = rb_check_hash_type(argv[1]);
5530 if (NIL_P(hash)) {
5531 StringValue(repl);
5535 pat = get_pat_quoted(argv[0], 1);
5537 str_modifiable(str);
5538 beg = rb_pat_search(pat, str, 0, 1);
5539 if (beg >= 0) {
5540 rb_encoding *enc;
5541 int cr = ENC_CODERANGE(str);
5542 long beg0, end0;
5543 VALUE match, match0 = Qnil;
5544 struct re_registers *regs;
5545 char *p, *rp;
5546 long len, rlen;
5548 match = rb_backref_get();
5549 regs = RMATCH_REGS(match);
/* beg0/end0: byte span of the whole match inside str */
5550 if (RB_TYPE_P(pat, T_STRING)) {
5551 beg0 = beg;
5552 end0 = beg0 + RSTRING_LEN(pat);
5553 match0 = pat;
5555 else {
5556 beg0 = BEG(0);
5557 end0 = END(0);
5558 if (iter) match0 = rb_reg_nth_match(0, match);
5561 if (iter || !NIL_P(hash)) {
/* remember pointer and length so str_mod_check() can compare them after user code ran */
5562 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5564 if (iter) {
5565 repl = rb_obj_as_string(rb_yield(match0));
5567 else {
5568 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5569 repl = rb_obj_as_string(repl);
/* NOTE(review): presumably raises if the block or hash lookup modified str - confirm in str_mod_check */
5571 str_mod_check(str, p, len);
5572 rb_check_frozen(str);
5574 else {
/* plain String replacement: rb_reg_regsub() builds the text to insert from repl and the match registers */
5575 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5578 enc = rb_enc_compatible(str, repl);
5579 if (!enc) {
/* no common encoding: tolerated only when everything outside the match is 7-bit, in which case the replacement's encoding is used */
5580 rb_encoding *str_enc = STR_ENC_GET(str);
5581 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5582 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5583 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5584 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5585 rb_enc_name(str_enc),
5586 rb_enc_name(STR_ENC_GET(repl)));
5588 enc = STR_ENC_GET(repl);
5590 rb_str_modify(str);
5591 rb_enc_associate(str, enc);
/* only a known, non-broken code range is combined with the replacement's */
5592 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
5593 int cr2 = ENC_CODERANGE(repl);
5594 if (cr2 == ENC_CODERANGE_BROKEN ||
5595 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5596 cr = ENC_CODERANGE_UNKNOWN;
5597 else
5598 cr = cr2;
/* plen: bytes being replaced; rlen: bytes being inserted */
5600 plen = end0 - beg0;
5601 rlen = RSTRING_LEN(repl);
5602 len = RSTRING_LEN(str);
5603 if (rlen > plen) {
5604 RESIZE_CAPA(str, len + rlen - plen);
5606 p = RSTRING_PTR(str);
5607 if (rlen != plen) {
/* shift the text after the match so the gap is exactly rlen bytes */
5608 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5610 rp = RSTRING_PTR(repl);
5611 memmove(p + beg0, rp, rlen);
5612 len += rlen - plen;
5613 STR_SET_LEN(str, len);
5614 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5615 ENC_CODERANGE_SET(str, cr);
5617 return str;
5619 return Qnil;
5624 * call-seq:
5625 * sub(pattern, replacement) -> new_string
5626 * sub(pattern) {|match| ... } -> new_string
5628 * Returns a copy of +self+ with only the first occurrence
5629 * (not all occurrences) of the given +pattern+ replaced.
5631 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5633 * Related: String#sub!, String#gsub, String#gsub!.
5637 static VALUE
5638 rb_str_sub(int argc, VALUE *argv, VALUE str)
5640 str = str_duplicate(rb_cString, str);
5641 rb_str_sub_bang(argc, argv, str);
5642 return str;
/*
 * Shared implementation of String#gsub and String#gsub!.
 *
 * Builds the substituted text in a fresh buffer (+dest+).  With +bang+
 * set, +str+ takes over the buffer contents and Qnil is returned when
 * nothing matched; otherwise the new string (or a duplicate of +str+ when
 * nothing matched) is returned.
 *
 * Replacement modes: STR (replacement String), ITER (block) and MAP (Hash
 * lookup by matched text).
 */
5645 static VALUE
5646 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5648 VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5649 struct re_registers *regs;
5650 long beg, beg0, end0;
5651 long offset, blen, slen, len, last;
5652 enum {STR, ITER, MAP} mode = STR;
5653 char *sp, *cp;
/* -1: not yet known whether the replacement depends on the match; settled after the first rb_reg_regsub() call in STR mode */
5654 int need_backref = -1;
5655 rb_encoding *str_enc;
5657 switch (argc) {
5658 case 1:
5659 RETURN_ENUMERATOR(str, argc, argv);
5660 mode = ITER;
5661 break;
5662 case 2:
5663 repl = argv[1];
5664 hash = rb_check_hash_type(argv[1]);
5665 if (NIL_P(hash)) {
5666 StringValue(repl);
5668 else {
5669 mode = MAP;
5671 break;
5672 default:
5673 rb_error_arity(argc, 1, 2);
5676 pat = get_pat_quoted(argv[0], 1);
5677 beg = rb_pat_search(pat, str, 0, need_backref);
5678 if (beg < 0) {
5679 if (bang) return Qnil; /* no match, no substitution */
5680 return str_duplicate(rb_cString, str);
/* offset: byte position in str up to which input has been consumed */
5683 offset = 0;
5684 blen = RSTRING_LEN(str) + 30; /* len + margin */
5685 dest = rb_str_buf_new(blen);
/* sp/slen are kept so str_mod_check() can compare them after user code ran */
5686 sp = RSTRING_PTR(str);
5687 slen = RSTRING_LEN(str);
5688 cp = sp;
5689 str_enc = STR_ENC_GET(str);
5690 rb_enc_associate(dest, str_enc);
5691 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
5693 do {
5694 match = rb_backref_get();
5695 regs = RMATCH_REGS(match);
/* beg0/end0: byte span of the current match */
5696 if (RB_TYPE_P(pat, T_STRING)) {
5697 beg0 = beg;
5698 end0 = beg0 + RSTRING_LEN(pat);
5699 match0 = pat;
5701 else {
5702 beg0 = BEG(0);
5703 end0 = END(0);
5704 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
/* non-zero mode means ITER or MAP: the replacement comes from user code */
5707 if (mode) {
5708 if (mode == ITER) {
5709 val = rb_obj_as_string(rb_yield(match0));
5711 else {
5712 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5713 val = rb_obj_as_string(val);
5715 str_mod_check(str, sp, slen);
5716 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5717 rb_raise(rb_eRuntimeError, "block should not cheat");
5720 else if (need_backref) {
5721 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5722 if (need_backref < 0) {
/* if regsub handed back repl itself, later iterations use repl directly */
5723 need_backref = val != repl;
5726 else {
5727 val = repl;
5730 len = beg0 - offset; /* copy pre-match substr */
5731 if (len) {
5732 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5735 rb_str_buf_append(dest, val);
/* last: offset at which the search that found this match started */
5737 last = offset;
5738 offset = end0;
5739 if (beg0 == end0) {
5741 * Always consume at least one character of the input string
5742 * in order to prevent infinite loops.
5744 if (RSTRING_LEN(str) <= end0) break;
5745 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5746 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5747 offset = end0 + len;
5749 cp = RSTRING_PTR(str) + offset;
5750 if (offset > RSTRING_LEN(str)) break;
5751 beg = rb_pat_search(pat, str, offset, need_backref);
5752 } while (beg >= 0);
/* copy whatever follows the final match */
5753 if (RSTRING_LEN(str) > offset) {
5754 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
/* NOTE(review): re-runs the search from `last` with backref setting forced on, presumably so the backref describes the final match - confirm */
5756 rb_pat_search(pat, str, last, 1);
5757 if (bang) {
5758 str_shared_replace(str, dest);
5760 else {
5761 str = dest;
5764 return str;
5769 * call-seq:
5770 * gsub!(pattern, replacement) -> self or nil
5771 * gsub!(pattern) {|match| ... } -> self or nil
5772 * gsub!(pattern) -> an_enumerator
5774 * Performs the specified substring replacement(s) on +self+;
5775 * returns +self+ if any replacement occurred, +nil+ otherwise.
5777 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5779 * Returns an Enumerator if no +replacement+ and no block given.
5781 * Related: String#sub, String#gsub, String#sub!.
5785 static VALUE
5786 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5788 str_modify_keep_cr(str);
5789 return str_gsub(argc, argv, str, 1);
5794 * call-seq:
5795 * gsub(pattern, replacement) -> new_string
5796 * gsub(pattern) {|match| ... } -> new_string
5797 * gsub(pattern) -> enumerator
5799 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5801 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5803 * Returns an Enumerator if no +replacement+ and no block given.
5805 * Related: String#sub, String#sub!, String#gsub!.
5809 static VALUE
5810 rb_str_gsub(int argc, VALUE *argv, VALUE str)
5812 return str_gsub(argc, argv, str, 0);
5817 * call-seq:
5818 * replace(other_string) -> self
5820 * Replaces the contents of +self+ with the contents of +other_string+:
5822 * s = 'foo' # => "foo"
5823 * s.replace('bar') # => "bar"
5827 VALUE
5828 rb_str_replace(VALUE str, VALUE str2)
5830 str_modifiable(str);
5831 if (str == str2) return str;
5833 StringValue(str2);
5834 str_discard(str);
5835 return str_replace(str, str2);
5839 * call-seq:
5840 * clear -> self
5842 * Removes the contents of +self+:
5844 * s = 'foo' # => "foo"
5845 * s.clear # => ""
5849 static VALUE
5850 rb_str_clear(VALUE str)
5852 str_discard(str);
5853 STR_SET_EMBED(str);
5854 STR_SET_EMBED_LEN(str, 0);
5855 RSTRING_PTR(str)[0] = 0;
5856 if (rb_enc_asciicompat(STR_ENC_GET(str)))
5857 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
5858 else
5859 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5860 return str;
5864 * call-seq:
5865 * chr -> string
5867 * Returns a string containing the first character of +self+:
5869 * s = 'foo' # => "foo"
5870 * s.chr # => "f"
5874 static VALUE
5875 rb_str_chr(VALUE str)
5877 return rb_str_substr(str, 0, 1);
5881 * call-seq:
5882 * getbyte(index) -> integer
5884 * Returns the byte at zero-based +index+ as an integer:
5886 * s = 'abcde' # => "abcde"
5887 * s.getbyte(0) # => 97
5888 * s.getbyte(1) # => 98
5890 * Related: String#setbyte.
5892 static VALUE
5893 rb_str_getbyte(VALUE str, VALUE index)
5895 long pos = NUM2LONG(index);
5897 if (pos < 0)
5898 pos += RSTRING_LEN(str);
5899 if (pos < 0 || RSTRING_LEN(str) <= pos)
5900 return Qnil;
5902 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5906 * call-seq:
5907 * setbyte(index, integer) -> integer
5909 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
5911 * s = 'abcde' # => "abcde"
5912 * s.setbyte(0, 98) # => 98
5913 * s # => "bbcde"
5915 * Related: String#getbyte.
/*
 * Implements String#setbyte: stores the low 8 bits of +value+ at byte
 * position +index+ and returns +value+.  Raises IndexError when +index+
 * is outside the string; a negative index counts from the end.
 *
 * For non-embedded strings the cached code range is adjusted based on the
 * character around the written byte; for embedded strings it is cleared.
 */
5917 static VALUE
5918 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5920 long pos = NUM2LONG(index);
5921 long len = RSTRING_LEN(str);
5922 char *ptr, *head, *left = 0;
5923 rb_encoding *enc;
5924 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5926 if (pos < -len || len <= pos)
5927 rb_raise(rb_eIndexError, "index %ld out of string", pos);
5928 if (pos < 0)
5929 pos += len;
/* convert to Integer, then mask to one byte before narrowing to char */
5931 VALUE v = rb_to_int(value);
5932 VALUE w = rb_int_and(v, INT2FIX(0xff));
5933 char byte = (char)(NUM2INT(w) & 0xFF);
5935 if (!str_independent(str))
5936 str_make_independent(str);
5937 enc = STR_ENC_GET(str);
5938 head = RSTRING_PTR(str);
5939 ptr = &head[pos];
5940 if (!STR_EMBED_P(str)) {
5941 cr = ENC_CODERANGE(str);
5942 switch (cr) {
5943 case ENC_CODERANGE_7BIT:
5944 left = ptr;
5945 *ptr = byte;
/* an ASCII byte written into a 7-bit string leaves the code range as it was */
5946 if (ISASCII(byte)) goto end;
5947 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5948 if (!MBCLEN_CHARFOUND_P(nlen))
5949 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5950 else
5951 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5952 goto end;
5953 case ENC_CODERANGE_VALID:
/* measure the character containing ptr before and after the write */
5954 left = rb_enc_left_char_head(head, ptr, head+len, enc);
5955 width = rb_enc_precise_mbclen(left, head+len, enc);
5956 *ptr = byte;
5957 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5958 if (!MBCLEN_CHARFOUND_P(nlen))
5959 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5960 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5961 ENC_CODERANGE_CLEAR(str);
5962 goto end;
/* embedded strings and the remaining code ranges: drop the cached code range and write */
5965 ENC_CODERANGE_CLEAR(str);
5966 *ptr = byte;
5968 end:
5969 return value;
5972 static VALUE
5973 str_byte_substr(VALUE str, long beg, long len, int empty)
5975 char *p, *s = RSTRING_PTR(str);
5976 long n = RSTRING_LEN(str);
5977 VALUE str2;
5979 if (beg > n || len < 0) return Qnil;
5980 if (beg < 0) {
5981 beg += n;
5982 if (beg < 0) return Qnil;
5984 if (len > n - beg)
5985 len = n - beg;
5986 if (len <= 0) {
5987 if (!empty) return Qnil;
5988 len = 0;
5989 p = 0;
5991 else
5992 p = s + beg;
5994 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
5995 str2 = rb_str_new_frozen(str);
5996 str2 = str_new_shared(rb_cString, str2);
5997 RSTRING(str2)->as.heap.ptr += beg;
5998 RSTRING(str2)->as.heap.len = len;
6000 else {
6001 str2 = rb_str_new(p, len);
6004 str_enc_copy(str2, str);
6006 if (RSTRING_LEN(str2) == 0) {
6007 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6008 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
6009 else
6010 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6012 else {
6013 switch (ENC_CODERANGE(str)) {
6014 case ENC_CODERANGE_7BIT:
6015 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6016 break;
6017 default:
6018 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
6019 break;
6023 return str2;
6026 static VALUE
6027 str_byte_aref(VALUE str, VALUE indx)
6029 long idx;
6030 if (FIXNUM_P(indx)) {
6031 idx = FIX2LONG(indx);
6033 else {
6034 /* check if indx is Range */
6035 long beg, len = RSTRING_LEN(str);
6037 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6038 case Qfalse:
6039 break;
6040 case Qnil:
6041 return Qnil;
6042 default:
6043 return str_byte_substr(str, beg, len, TRUE);
6046 idx = NUM2LONG(indx);
6048 return str_byte_substr(str, idx, 1, FALSE);
6052 * call-seq:
6053 * byteslice(index, length = 1) -> string or nil
6054 * byteslice(range) -> string or nil
6056 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6058 * With integer arguments +index+ and +length+ given,
6059 * returns the substring beginning at the given +index+
6060 * of the given +length+ (if possible),
6061 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6063 * s = '0123456789' # => "0123456789"
6064 * s.byteslice(2) # => "2"
6065 * s.byteslice(200) # => nil
6066 * s.byteslice(4, 3) # => "456"
6067 * s.byteslice(4, 30) # => "456789"
6068 * s.byteslice(4, -1) # => nil
6069 * s.byteslice(40, 2) # => nil
6071 * In either case above, counts backwards from the end of +self+
6072 * if +index+ is negative:
6074 * s = '0123456789' # => "0123456789"
6075 * s.byteslice(-4) # => "6"
6076 * s.byteslice(-4, 3) # => "678"
6078 * With Range argument +range+ given, returns
6079 * <tt>byteslice(range.begin, range.size)</tt>:
6081 * s = '0123456789' # => "0123456789"
6082 * s.byteslice(4..6) # => "456"
6083 * s.byteslice(-6..-4) # => "456"
6084 * s.byteslice(5..2) # => "" # range.size is zero.
6085 * s.byteslice(40..42) # => nil
6087 * In all cases, a returned string has the same encoding as +self+:
6089 * s.encoding # => #<Encoding:UTF-8>
6090 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6094 static VALUE
6095 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6097 if (argc == 2) {
6098 long beg = NUM2LONG(argv[0]);
6099 long end = NUM2LONG(argv[1]);
6100 return str_byte_substr(str, beg, end, TRUE);
6102 rb_check_arity(argc, 1, 2);
6103 return str_byte_aref(str, argv[0]);
6107 * call-seq:
6108 * reverse -> string
6110 * Returns a new string with the characters from +self+ in reverse order.
6112 * 'stressed'.reverse # => "desserts"
6116 static VALUE
6117 rb_str_reverse(VALUE str)
6119 rb_encoding *enc;
6120 VALUE rev;
6121 char *s, *e, *p;
6122 int cr;
6124 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6125 enc = STR_ENC_GET(str);
6126 rev = rb_str_new(0, RSTRING_LEN(str));
6127 s = RSTRING_PTR(str); e = RSTRING_END(str);
6128 p = RSTRING_END(rev);
6129 cr = ENC_CODERANGE(str);
6131 if (RSTRING_LEN(str) > 1) {
6132 if (single_byte_optimizable(str)) {
6133 while (s < e) {
6134 *--p = *s++;
6137 else if (cr == ENC_CODERANGE_VALID) {
6138 while (s < e) {
6139 int clen = rb_enc_fast_mbclen(s, e, enc);
6141 p -= clen;
6142 memcpy(p, s, clen);
6143 s += clen;
6146 else {
6147 cr = rb_enc_asciicompat(enc) ?
6148 ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
6149 while (s < e) {
6150 int clen = rb_enc_mbclen(s, e, enc);
6152 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6153 p -= clen;
6154 memcpy(p, s, clen);
6155 s += clen;
6159 STR_SET_LEN(rev, RSTRING_LEN(str));
6160 str_enc_copy(rev, str);
6161 ENC_CODERANGE_SET(rev, cr);
6163 return rev;
6168 * call-seq:
6169 * reverse! -> self
6171 * Returns +self+ with its characters reversed:
6173 * s = 'stressed'
6174 * s.reverse! # => "desserts"
6175 * s # => "desserts"
6179 static VALUE
6180 rb_str_reverse_bang(VALUE str)
6182 if (RSTRING_LEN(str) > 1) {
6183 if (single_byte_optimizable(str)) {
6184 char *s, *e, c;
6186 str_modify_keep_cr(str);
6187 s = RSTRING_PTR(str);
6188 e = RSTRING_END(str) - 1;
6189 while (s < e) {
6190 c = *s;
6191 *s++ = *e;
6192 *e-- = c;
6195 else {
6196 str_shared_replace(str, rb_str_reverse(str));
6199 else {
6200 str_modify_keep_cr(str);
6202 return str;
6207 * call-seq:
6208 * include? other_string -> true or false
6210 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6212 * s = 'foo'
6213 * s.include?('f') # => true
6214 * s.include?('fo') # => true
6215 * s.include?('food') # => false
6219 static VALUE
6220 rb_str_include(VALUE str, VALUE arg)
6222 long i;
6224 StringValue(arg);
6225 i = rb_str_index(str, arg, 0);
6227 return RBOOL(i != -1);
6232 * call-seq:
6233 * to_i(base = 10) -> integer
6235 * Returns the result of interpreting leading characters in +self+
6236 * as an integer in the given +base+ (which must be in (2..36)):
6238 * '123456'.to_i # => 123456
6239 * '123def'.to_i(16) # => 1195503
6241 * Characters past a leading valid number (in the given +base+) are ignored:
6243 * '12.345'.to_i # => 12
6244 * '12345'.to_i(2) # => 1
6246 * Returns zero if there is no leading valid number:
6248 * 'abcdef'.to_i # => 0
6249 * '2'.to_i(2) # => 0
6253 static VALUE
6254 rb_str_to_i(int argc, VALUE *argv, VALUE str)
6256 int base = 10;
6258 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6259 rb_raise(rb_eArgError, "invalid radix %d", base);
6261 return rb_str_to_inum(str, base, FALSE);
6266 * call-seq:
6267 * to_f -> float
6269 * Returns the result of interpreting leading characters in +self+ as a Float:
6271 * '3.14159'.to_f # => 3.14159
6272 * '1.234e-2'.to_f # => 0.01234
6274 * Characters past a leading valid number are ignored:
6276 * '3.14 (pi to two places)'.to_f # => 3.14
6278 * Returns zero if there is no leading valid number:
6280 * 'abcdef'.to_f # => 0.0
6284 static VALUE
6285 rb_str_to_f(VALUE str)
6287 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6292 * call-seq:
6293 * to_s -> self or string
6295 * Returns +self+ if +self+ is a \String,
6296 * or +self+ converted to a \String if +self+ is a subclass of \String.
6298 * String#to_str is an alias for String#to_s.
6302 static VALUE
6303 rb_str_to_s(VALUE str)
6305 if (rb_obj_class(str) != rb_cString) {
6306 return str_duplicate(rb_cString, str);
6308 return str;
/*
 * Compiled out by the surrounding #if 0.  As written it would encode code
 * point +c+ in +enc+ into a temporary buffer and append those bytes to
 * +str+.
 */
6311 #if 0
6312 static void
6313 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
6315 char s[RUBY_MAX_CHAR_LEN];
6316 int n = rb_enc_codelen(c, enc);
6318 rb_enc_mbcput(c, s, enc);
6319 rb_enc_str_buf_cat(str, s, n, enc);
6321 #endif
6323 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6326 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6328 char buf[CHAR_ESC_LEN + 1];
6329 int l;
6331 #if SIZEOF_INT > 4
6332 c &= 0xffffffff;
6333 #endif
6334 if (unicode_p) {
6335 if (c < 0x7F && ISPRINT(c)) {
6336 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6338 else if (c < 0x10000) {
6339 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6341 else {
6342 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6345 else {
6346 if (c < 0x100) {
6347 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6349 else {
6350 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6353 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6354 rb_str_buf_cat(result, buf, l);
6355 return l;
/*
 * Returns the backslash escape used for control character +c+ as a
 * static string ("\\n" for newline, "\\c?" for DEL, ...), or NULL when
 * +c+ has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    switch (c) {
      case '\0': return "\\0";
      case '\n': return "\\n";
      case '\r': return "\\r";
      case '\t': return "\\t";
      case '\f': return "\\f";
      case '\013': return "\\v";
      case '\010': return "\\b";
      case '\007': return "\\a";
      case '\033': return "\\e";
      case '\x7f': return "\\c?";
      default:
        /* no dedicated escape for this character */
        return NULL;
    }
}
/*
 * Returns a new string holding an escaped rendering of +str+.  The result
 * is tagged with the US-ASCII encoding index and a 7-bit code range.
 *
 * The loop walks +str+ one character at a time.  `prev` marks the start
 * of the run of characters that can be copied verbatim; the run is
 * flushed to +result+ whenever an escape has to be emitted.
 */
6376 VALUE
6377 rb_str_escape(VALUE str)
6379 int encidx = ENCODING_GET(str);
6380 rb_encoding *enc = rb_enc_from_index(encidx);
6381 const char *p = RSTRING_PTR(str);
6382 const char *pend = RSTRING_END(str);
6383 const char *prev = p;
6384 char buf[CHAR_ESC_LEN + 1];
6385 VALUE result = rb_str_buf_new(0);
6386 int unicode_p = rb_enc_unicode_p(enc);
6387 int asciicompat = rb_enc_asciicompat(enc);
6389 while (p < pend) {
6390 unsigned int c;
6391 const char *cc;
6392 int n = rb_enc_precise_mbclen(p, pend, enc);
6393 if (!MBCLEN_CHARFOUND_P(n)) {
/* no valid character here: flush the pending run, then emit up to mbminlen bytes as \xHH each */
6394 if (p > prev) str_buf_cat(result, prev, p - prev);
6395 n = rb_enc_mbminlen(enc);
6396 if (pend < p + n)
6397 n = (int)(pend - p);
6398 while (n--) {
6399 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6400 str_buf_cat(result, buf, strlen(buf));
6401 prev = ++p;
6403 continue;
6405 n = MBCLEN_CHARFOUND_LEN(n);
6406 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6407 p += n;
/* characters with a dedicated escape (see ruby_escaped_char) */
6408 cc = ruby_escaped_char(c);
6409 if (cc) {
6410 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6411 str_buf_cat(result, cc, strlen(cc));
6412 prev = p;
/* printable ASCII stays in the pending run and is copied verbatim later */
6414 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6416 else {
/* everything else is written as a numeric escape */
6417 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6418 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6419 prev = p;
/* flush whatever verbatim run is left */
6422 if (p > prev) str_buf_cat(result, prev, p - prev);
6423 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6425 return result;
6429 * call-seq:
6430 * inspect -> string
6432 * Returns a printable version of +self+, enclosed in double-quotes,
6433 * and with special characters escaped:
6435 * s = "foo\tbar\tbaz\n"
6436 * # => "foo\tbar\tbaz\n"
6437 * s.inspect
6438 * # => "\"foo\\tbar\\tbaz\\n\""
6442 VALUE
6443 rb_str_inspect(VALUE str)
6445 int encidx = ENCODING_GET(str);
6446 rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
6447 const char *p, *pend, *prev;
6448 char buf[CHAR_ESC_LEN + 1];
6449 VALUE result = rb_str_buf_new(0);
6450 rb_encoding *resenc = rb_default_internal_encoding();
6451 int unicode_p = rb_enc_unicode_p(enc);
6452 int asciicompat = rb_enc_asciicompat(enc);
6454 if (resenc == NULL) resenc = rb_default_external_encoding();
6455 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6456 rb_enc_associate(result, resenc);
6457 str_buf_cat2(result, "\"");
6459 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6460 prev = p;
6461 actenc = get_actual_encoding(encidx, str);
6462 if (actenc != enc) {
6463 enc = actenc;
6464 if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
6466 while (p < pend) {
6467 unsigned int c, cc;
6468 int n;
6470 n = rb_enc_precise_mbclen(p, pend, enc);
6471 if (!MBCLEN_CHARFOUND_P(n)) {
6472 if (p > prev) str_buf_cat(result, prev, p - prev);
6473 n = rb_enc_mbminlen(enc);
6474 if (pend < p + n)
6475 n = (int)(pend - p);
6476 while (n--) {
6477 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6478 str_buf_cat(result, buf, strlen(buf));
6479 prev = ++p;
6481 continue;
6483 n = MBCLEN_CHARFOUND_LEN(n);
6484 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6485 p += n;
6486 if ((asciicompat || unicode_p) &&
6487 (c == '"'|| c == '\\' ||
6488 (c == '#' &&
6489 p < pend &&
6490 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6491 (cc = rb_enc_codepoint(p,pend,enc),
6492 (cc == '$' || cc == '@' || cc == '{'))))) {
6493 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6494 str_buf_cat2(result, "\\");
6495 if (asciicompat || enc == resenc) {
6496 prev = p - n;
6497 continue;
6500 switch (c) {
6501 case '\n': cc = 'n'; break;
6502 case '\r': cc = 'r'; break;
6503 case '\t': cc = 't'; break;
6504 case '\f': cc = 'f'; break;
6505 case '\013': cc = 'v'; break;
6506 case '\010': cc = 'b'; break;
6507 case '\007': cc = 'a'; break;
6508 case 033: cc = 'e'; break;
6509 default: cc = 0; break;
6511 if (cc) {
6512 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6513 buf[0] = '\\';
6514 buf[1] = (char)cc;
6515 str_buf_cat(result, buf, 2);
6516 prev = p;
6517 continue;
6519 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
6520 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6521 continue;
6523 else {
6524 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6525 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6526 prev = p;
6527 continue;
6530 if (p > prev) str_buf_cat(result, prev, p - prev);
6531 str_buf_cat2(result, "\"");
6533 return result;
6536 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6539 * call-seq:
6540 * dump -> string
6542 * Returns a printable version of +self+, enclosed in double-quotes,
6543 * with special characters escaped, and with non-printing characters
6544 * replaced by hexadecimal notation:
6546 * "hello \n ''".dump # => "\"hello \\n ''\""
6547 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6549 * Related: String#undump (inverse of String#dump).
6553 VALUE
6554 rb_str_dump(VALUE str)
6556 int encidx = rb_enc_get_index(str);
6557 rb_encoding *enc = rb_enc_from_index(encidx);
6558 long len;
6559 const char *p, *pend;
6560 char *q, *qend;
6561 VALUE result;
6562 int u8 = (encidx == rb_utf8_encindex());
6563 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6565 len = 2; /* "" */
6566 if (!rb_enc_asciicompat(enc)) {
6567 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6568 len += strlen(enc->name);
6571 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6572 while (p < pend) {
6573 int clen;
6574 unsigned char c = *p++;
6576 switch (c) {
6577 case '"': case '\\':
6578 case '\n': case '\r':
6579 case '\t': case '\f':
6580 case '\013': case '\010': case '\007': case '\033':
6581 clen = 2;
6582 break;
6584 case '#':
6585 clen = IS_EVSTR(p, pend) ? 2 : 1;
6586 break;
6588 default:
6589 if (ISPRINT(c)) {
6590 clen = 1;
6592 else {
6593 if (u8 && c > 0x7F) { /* \u notation */
6594 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6595 if (MBCLEN_CHARFOUND_P(n)) {
6596 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6597 if (cc <= 0xFFFF)
6598 clen = 6; /* \uXXXX */
6599 else if (cc <= 0xFFFFF)
6600 clen = 9; /* \u{XXXXX} */
6601 else
6602 clen = 10; /* \u{XXXXXX} */
6603 p += MBCLEN_CHARFOUND_LEN(n)-1;
6604 break;
6607 clen = 4; /* \xNN */
6609 break;
6612 if (clen > LONG_MAX - len) {
6613 rb_raise(rb_eRuntimeError, "string size too big");
6615 len += clen;
6618 result = rb_str_new(0, len);
6619 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6620 q = RSTRING_PTR(result); qend = q + len + 1;
6622 *q++ = '"';
6623 while (p < pend) {
6624 unsigned char c = *p++;
6626 if (c == '"' || c == '\\') {
6627 *q++ = '\\';
6628 *q++ = c;
6630 else if (c == '#') {
6631 if (IS_EVSTR(p, pend)) *q++ = '\\';
6632 *q++ = '#';
6634 else if (c == '\n') {
6635 *q++ = '\\';
6636 *q++ = 'n';
6638 else if (c == '\r') {
6639 *q++ = '\\';
6640 *q++ = 'r';
6642 else if (c == '\t') {
6643 *q++ = '\\';
6644 *q++ = 't';
6646 else if (c == '\f') {
6647 *q++ = '\\';
6648 *q++ = 'f';
6650 else if (c == '\013') {
6651 *q++ = '\\';
6652 *q++ = 'v';
6654 else if (c == '\010') {
6655 *q++ = '\\';
6656 *q++ = 'b';
6658 else if (c == '\007') {
6659 *q++ = '\\';
6660 *q++ = 'a';
6662 else if (c == '\033') {
6663 *q++ = '\\';
6664 *q++ = 'e';
6666 else if (ISPRINT(c)) {
6667 *q++ = c;
6669 else {
6670 *q++ = '\\';
6671 if (u8) {
6672 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6673 if (MBCLEN_CHARFOUND_P(n)) {
6674 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6675 p += n;
6676 if (cc <= 0xFFFF)
6677 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6678 else
6679 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6680 q += strlen(q);
6681 continue;
6684 snprintf(q, qend-q, "x%02X", c);
6685 q += 3;
6688 *q++ = '"';
6689 *q = '\0';
6690 if (!rb_enc_asciicompat(enc)) {
6691 snprintf(q, qend-q, nonascii_suffix, enc->name);
6692 encidx = rb_ascii8bit_encindex();
6694 /* result from dump is ASCII */
6695 rb_enc_associate_index(result, encidx);
6696 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
6697 return result;
/*
 * Maps the letter following a backslash (one of n r t f v b a e) to the
 * control character it denotes.  Callers are expected to pass only those
 * letters; any other value reaches UNREACHABLE_RETURN.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'f': return '\f';
      case 'v': return '\13';
      case 'b': return '\010';
      case 'a': return '\007';
      case 'e': return 033;
    }
    UNREACHABLE_RETURN(-1);
}
6724 static void
6725 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6727 const char *s = *ss;
6728 unsigned int c;
6729 int codelen;
6730 size_t hexlen;
6731 unsigned char buf[6];
6732 static rb_encoding *enc_utf8 = NULL;
6734 switch (*s) {
6735 case '\\':
6736 case '"':
6737 case '#':
6738 rb_str_cat(undumped, s, 1); /* cat itself */
6739 s++;
6740 break;
6741 case 'n':
6742 case 'r':
6743 case 't':
6744 case 'f':
6745 case 'v':
6746 case 'b':
6747 case 'a':
6748 case 'e':
6749 *buf = unescape_ascii(*s);
6750 rb_str_cat(undumped, (char *)buf, 1);
6751 s++;
6752 break;
6753 case 'u':
6754 if (*binary) {
6755 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6757 *utf8 = true;
6758 if (++s >= s_end) {
6759 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6761 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6762 if (*penc != enc_utf8) {
6763 *penc = enc_utf8;
6764 rb_enc_associate(undumped, enc_utf8);
6766 if (*s == '{') { /* handle \u{...} form */
6767 s++;
6768 for (;;) {
6769 if (s >= s_end) {
6770 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6772 if (*s == '}') {
6773 s++;
6774 break;
6776 if (ISSPACE(*s)) {
6777 s++;
6778 continue;
6780 c = scan_hex(s, s_end-s, &hexlen);
6781 if (hexlen == 0 || hexlen > 6) {
6782 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6784 if (c > 0x10ffff) {
6785 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6787 if (0xd800 <= c && c <= 0xdfff) {
6788 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6790 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6791 rb_str_cat(undumped, (char *)buf, codelen);
6792 s += hexlen;
6795 else { /* handle \uXXXX form */
6796 c = scan_hex(s, 4, &hexlen);
6797 if (hexlen != 4) {
6798 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6800 if (0xd800 <= c && c <= 0xdfff) {
6801 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6803 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6804 rb_str_cat(undumped, (char *)buf, codelen);
6805 s += hexlen;
6807 break;
6808 case 'x':
6809 if (*utf8) {
6810 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6812 *binary = true;
6813 if (++s >= s_end) {
6814 rb_raise(rb_eRuntimeError, "invalid hex escape");
6816 *buf = scan_hex(s, 2, &hexlen);
6817 if (hexlen != 2) {
6818 rb_raise(rb_eRuntimeError, "invalid hex escape");
6820 rb_str_cat(undumped, (char *)buf, 1);
6821 s += hexlen;
6822 break;
6823 default:
6824 rb_str_cat(undumped, s-1, 2);
6825 s++;
6828 *ss = s;
6831 static VALUE rb_str_is_ascii_only_p(VALUE str);
6834 * call-seq:
6835 * undump -> string
6837 * Returns an unescaped version of +self+:
6839 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
6840 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6841 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
6842 * s_undumped == s_orig # => true
6844 * Related: String#dump (inverse of String#undump).
6848 static VALUE
6849 str_undump(VALUE str)
6851 const char *s = RSTRING_PTR(str);
6852 const char *s_end = RSTRING_END(str);
6853 rb_encoding *enc = rb_enc_get(str);
6854 VALUE undumped = rb_enc_str_new(s, 0L, enc);
6855 bool utf8 = false;
6856 bool binary = false;
6857 int w;
6859 rb_must_asciicompat(str);
6860 if (rb_str_is_ascii_only_p(str) == Qfalse) {
6861 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6863 if (!str_null_check(str, &w)) {
6864 rb_raise(rb_eRuntimeError, "string contains null byte");
6866 if (RSTRING_LEN(str) < 2) goto invalid_format;
6867 if (*s != '"') goto invalid_format;
6869 /* strip '"' at the start */
6870 s++;
6872 for (;;) {
6873 if (s >= s_end) {
6874 rb_raise(rb_eRuntimeError, "unterminated dumped string");
6877 if (*s == '"') {
6878 /* epilogue */
6879 s++;
6880 if (s == s_end) {
6881 /* ascii compatible dumped string */
6882 break;
6884 else {
6885 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6886 static const char dup_suffix[] = ".dup";
6887 const char *encname;
6888 int encidx;
6889 ptrdiff_t size;
6891 /* check separately for strings dumped by older versions */
6892 size = sizeof(dup_suffix) - 1;
6893 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6895 size = sizeof(force_encoding_suffix) - 1;
6896 if (s_end - s <= size) goto invalid_format;
6897 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6898 s += size;
6900 if (utf8) {
6901 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6904 encname = s;
6905 s = memchr(s, '"', s_end-s);
6906 size = s - encname;
6907 if (!s) goto invalid_format;
6908 if (s_end - s != 2) goto invalid_format;
6909 if (s[0] != '"' || s[1] != ')') goto invalid_format;
6911 encidx = rb_enc_find_index2(encname, (long)size);
6912 if (encidx < 0) {
6913 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6915 rb_enc_associate_index(undumped, encidx);
6917 break;
6920 if (*s == '\\') {
6921 s++;
6922 if (s >= s_end) {
6923 rb_raise(rb_eRuntimeError, "invalid escape");
6925 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6927 else {
6928 rb_str_cat(undumped, s++, 1);
6932 return undumped;
6933 invalid_format:
6934 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6937 static void
6938 rb_str_check_dummy_enc(rb_encoding *enc)
6940 if (rb_enc_dummy_p(enc)) {
6941 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6942 rb_enc_name(enc));
6946 static rb_encoding *
6947 str_true_enc(VALUE str)
6949 rb_encoding *enc = STR_ENC_GET(str);
6950 rb_str_check_dummy_enc(enc);
6951 return enc;
6954 static OnigCaseFoldType
6955 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6957 if (argc==0)
6958 return flags;
6959 if (argc>2)
6960 rb_raise(rb_eArgError, "too many options");
6961 if (argv[0]==sym_turkic) {
6962 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6963 if (argc==2) {
6964 if (argv[1]==sym_lithuanian)
6965 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6966 else
6967 rb_raise(rb_eArgError, "invalid second option");
6970 else if (argv[0]==sym_lithuanian) {
6971 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6972 if (argc==2) {
6973 if (argv[1]==sym_turkic)
6974 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6975 else
6976 rb_raise(rb_eArgError, "invalid second option");
6979 else if (argc>1)
6980 rb_raise(rb_eArgError, "too many options");
6981 else if (argv[0]==sym_ascii)
6982 flags |= ONIGENC_CASE_ASCII_ONLY;
6983 else if (argv[0]==sym_fold) {
6984 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
6985 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
6986 else
6987 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
6989 else
6990 rb_raise(rb_eArgError, "invalid option");
6991 return flags;
6994 static inline bool
6995 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
6997 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
6998 return true;
6999 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7002 /* extra headroom (20 bytes) meant to be long enough to absorb any kind of single character length increase */
7003 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
7004 #ifndef CASEMAP_DEBUG
7005 # define CASEMAP_DEBUG 0
7006 #endif
7008 struct mapping_buffer;
7009 typedef struct mapping_buffer {
7010 size_t capa;
7011 size_t used;
7012 struct mapping_buffer *next;
7013 OnigUChar space[FLEX_ARY_LEN];
7014 } mapping_buffer;
7016 static void
7017 mapping_buffer_free(void *p)
7019 mapping_buffer *previous_buffer;
7020 mapping_buffer *current_buffer = p;
7021 while (current_buffer) {
7022 previous_buffer = current_buffer;
7023 current_buffer = current_buffer->next;
7024 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7028 static const rb_data_type_t mapping_buffer_type = {
7029 "mapping_buffer",
7030 {0, mapping_buffer_free,}
7033 static VALUE
7034 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7036 VALUE target;
7038 const OnigUChar *source_current, *source_end;
7039 int target_length = 0;
7040 VALUE buffer_anchor;
7041 mapping_buffer *current_buffer = 0;
7042 mapping_buffer **pre_buffer;
7043 size_t buffer_count = 0;
7044 int buffer_length_or_invalid;
7046 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7048 source_current = (OnigUChar*)RSTRING_PTR(source);
7049 source_end = (OnigUChar*)RSTRING_END(source);
7051 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7052 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7053 while (source_current < source_end) {
7054 /* increase multiplier using buffer count to converge quickly */
7055 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7056 if (CASEMAP_DEBUG) {
7057 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7059 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7060 *pre_buffer = current_buffer;
7061 pre_buffer = &current_buffer->next;
7062 current_buffer->next = NULL;
7063 current_buffer->capa = capa;
7064 buffer_length_or_invalid = enc->case_map(flags,
7065 &source_current, source_end,
7066 current_buffer->space,
7067 current_buffer->space+current_buffer->capa,
7068 enc);
7069 if (buffer_length_or_invalid < 0) {
7070 current_buffer = DATA_PTR(buffer_anchor);
7071 DATA_PTR(buffer_anchor) = 0;
7072 mapping_buffer_free(current_buffer);
7073 rb_raise(rb_eArgError, "input string invalid");
7075 target_length += current_buffer->used = buffer_length_or_invalid;
7077 if (CASEMAP_DEBUG) {
7078 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7081 if (buffer_count==1) {
7082 target = rb_str_new((const char*)current_buffer->space, target_length);
7084 else {
7085 char *target_current;
7087 target = rb_str_new(0, target_length);
7088 target_current = RSTRING_PTR(target);
7089 current_buffer = DATA_PTR(buffer_anchor);
7090 while (current_buffer) {
7091 memcpy(target_current, current_buffer->space, current_buffer->used);
7092 target_current += current_buffer->used;
7093 current_buffer = current_buffer->next;
7096 current_buffer = DATA_PTR(buffer_anchor);
7097 DATA_PTR(buffer_anchor) = 0;
7098 mapping_buffer_free(current_buffer);
7100 /* TODO: check about string terminator character */
7101 str_enc_copy(target, source);
7102 /*ENC_CODERANGE_SET(mapped, cr);*/
7104 return target;
7107 static VALUE
7108 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7110 const OnigUChar *source_current, *source_end;
7111 OnigUChar *target_current, *target_end;
7112 long old_length = RSTRING_LEN(source);
7113 int length_or_invalid;
7115 if (old_length == 0) return Qnil;
7117 source_current = (OnigUChar*)RSTRING_PTR(source);
7118 source_end = (OnigUChar*)RSTRING_END(source);
7119 if (source == target) {
7120 target_current = (OnigUChar*)source_current;
7121 target_end = (OnigUChar*)source_end;
7123 else {
7124 target_current = (OnigUChar*)RSTRING_PTR(target);
7125 target_end = (OnigUChar*)RSTRING_END(target);
7128 length_or_invalid = onigenc_ascii_only_case_map(flags,
7129 &source_current, source_end,
7130 target_current, target_end, enc);
7131 if (length_or_invalid < 0)
7132 rb_raise(rb_eArgError, "input string invalid");
7133 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7134 fprintf(stderr, "problem with rb_str_ascii_casemap"
7135 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7136 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7137 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7140 str_enc_copy(target, source);
7142 return target;
7145 static bool
7146 upcase_single(VALUE str)
7148 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7149 bool modified = false;
7151 while (s < send) {
7152 unsigned int c = *(unsigned char*)s;
7154 if ('a' <= c && c <= 'z') {
7155 *s = 'A' + (c - 'a');
7156 modified = true;
7158 s++;
7160 return modified;
7164 * call-seq:
7165 * upcase!(*options) -> self or nil
7167 * Upcases the characters in +self+;
7168 * returns +self+ if any changes were made, +nil+ otherwise:
7170 * s = 'Hello World!' # => "Hello World!"
7171 * s.upcase! # => "HELLO WORLD!"
7172 * s # => "HELLO WORLD!"
7173 * s.upcase! # => nil
7175 * The casing may be affected by the given +options+;
7176 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7178 * Related: String#upcase, String#downcase, String#downcase!.
7182 static VALUE
7183 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7185 rb_encoding *enc;
7186 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7188 flags = check_case_options(argc, argv, flags);
7189 str_modify_keep_cr(str);
7190 enc = str_true_enc(str);
7191 if (case_option_single_p(flags, enc, str)) {
7192 if (upcase_single(str))
7193 flags |= ONIGENC_CASE_MODIFIED;
7195 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7196 rb_str_ascii_casemap(str, str, &flags, enc);
7197 else
7198 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7200 if (ONIGENC_CASE_MODIFIED&flags) return str;
7201 return Qnil;
7206 * call-seq:
7207 * upcase(*options) -> string
7209 * Returns a string containing the upcased characters in +self+:
7211 * s = 'Hello World!' # => "Hello World!"
7212 * s.upcase # => "HELLO WORLD!"
7214 * The casing may be affected by the given +options+;
7215 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7217 * Related: String#upcase!, String#downcase, String#downcase!.
7221 static VALUE
7222 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7224 rb_encoding *enc;
7225 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7226 VALUE ret;
7228 flags = check_case_options(argc, argv, flags);
7229 enc = str_true_enc(str);
7230 if (case_option_single_p(flags, enc, str)) {
7231 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7232 str_enc_copy(ret, str);
7233 upcase_single(ret);
7235 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7236 ret = rb_str_new(0, RSTRING_LEN(str));
7237 rb_str_ascii_casemap(str, ret, &flags, enc);
7239 else {
7240 ret = rb_str_casemap(str, &flags, enc);
7243 return ret;
7246 static bool
7247 downcase_single(VALUE str)
7249 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7250 bool modified = false;
7252 while (s < send) {
7253 unsigned int c = *(unsigned char*)s;
7255 if ('A' <= c && c <= 'Z') {
7256 *s = 'a' + (c - 'A');
7257 modified = true;
7259 s++;
7262 return modified;
7266 * call-seq:
7267 * downcase!(*options) -> self or nil
7269 * Downcases the characters in +self+;
7270 * returns +self+ if any changes were made, +nil+ otherwise:
7272 * s = 'Hello World!' # => "Hello World!"
7273 * s.downcase! # => "hello world!"
7274 * s # => "hello world!"
7275 * s.downcase! # => nil
7277 * The casing may be affected by the given +options+;
7278 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7280 * Related: String#downcase, String#upcase, String#upcase!.
7284 static VALUE
7285 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7287 rb_encoding *enc;
7288 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7290 flags = check_case_options(argc, argv, flags);
7291 str_modify_keep_cr(str);
7292 enc = str_true_enc(str);
7293 if (case_option_single_p(flags, enc, str)) {
7294 if (downcase_single(str))
7295 flags |= ONIGENC_CASE_MODIFIED;
7297 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7298 rb_str_ascii_casemap(str, str, &flags, enc);
7299 else
7300 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7302 if (ONIGENC_CASE_MODIFIED&flags) return str;
7303 return Qnil;
7308 * call-seq:
7309 * downcase(*options) -> string
7311 * Returns a string containing the downcased characters in +self+:
7313 * s = 'Hello World!' # => "Hello World!"
7314 * s.downcase # => "hello world!"
7316 * The casing may be affected by the given +options+;
7317 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7319 * Related: String#downcase!, String#upcase, String#upcase!.
7323 static VALUE
7324 rb_str_downcase(int argc, VALUE *argv, VALUE str)
7326 rb_encoding *enc;
7327 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7328 VALUE ret;
7330 flags = check_case_options(argc, argv, flags);
7331 enc = str_true_enc(str);
7332 if (case_option_single_p(flags, enc, str)) {
7333 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7334 str_enc_copy(ret, str);
7335 downcase_single(ret);
7337 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7338 ret = rb_str_new(0, RSTRING_LEN(str));
7339 rb_str_ascii_casemap(str, ret, &flags, enc);
7341 else {
7342 ret = rb_str_casemap(str, &flags, enc);
7345 return ret;
7350 * call-seq:
7351 * capitalize!(*options) -> self or nil
7353 * Upcases the first character in +self+;
7354 * downcases the remaining characters;
7355 * returns +self+ if any changes were made, +nil+ otherwise:
7357 * s = 'hello World!' # => "hello World!"
7358 * s.capitalize! # => "Hello world!"
7359 * s # => "Hello world!"
7360 * s.capitalize! # => nil
7362 * The casing may be affected by the given +options+;
7363 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7365 * Related: String#capitalize.
7369 static VALUE
7370 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7372 rb_encoding *enc;
7373 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7375 flags = check_case_options(argc, argv, flags);
7376 str_modify_keep_cr(str);
7377 enc = str_true_enc(str);
7378 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7379 if (flags&ONIGENC_CASE_ASCII_ONLY)
7380 rb_str_ascii_casemap(str, str, &flags, enc);
7381 else
7382 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7384 if (ONIGENC_CASE_MODIFIED&flags) return str;
7385 return Qnil;
7390 * call-seq:
7391 * capitalize(*options) -> string
7393 * Returns a string containing the characters in +self+;
7394 * the first character is upcased;
7395 * the remaining characters are downcased:
7397 * s = 'hello World!' # => "hello World!"
7398 * s.capitalize # => "Hello world!"
7400 * The casing may be affected by the given +options+;
7401 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7403 * Related: String#capitalize!.
7407 static VALUE
7408 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7410 rb_encoding *enc;
7411 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7412 VALUE ret;
7414 flags = check_case_options(argc, argv, flags);
7415 enc = str_true_enc(str);
7416 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7417 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7418 ret = rb_str_new(0, RSTRING_LEN(str));
7419 rb_str_ascii_casemap(str, ret, &flags, enc);
7421 else {
7422 ret = rb_str_casemap(str, &flags, enc);
7424 return ret;
7429 * call-seq:
7430 * swapcase!(*options) -> self or nil
7432 * Upcases each lowercase character in +self+;
7433 * downcases uppercase character;
7434 * returns +self+ if any changes were made, +nil+ otherwise:
7436 * s = 'Hello World!' # => "Hello World!"
7437 * s.swapcase! # => "hELLO wORLD!"
7438 * s # => "Hello World!"
7439 * ''.swapcase! # => nil
7441 * The casing may be affected by the given +options+;
7442 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7444 * Related: String#swapcase.
7448 static VALUE
7449 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7451 rb_encoding *enc;
7452 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7454 flags = check_case_options(argc, argv, flags);
7455 str_modify_keep_cr(str);
7456 enc = str_true_enc(str);
7457 if (flags&ONIGENC_CASE_ASCII_ONLY)
7458 rb_str_ascii_casemap(str, str, &flags, enc);
7459 else
7460 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7462 if (ONIGENC_CASE_MODIFIED&flags) return str;
7463 return Qnil;
7468 * call-seq:
7469 * swapcase(*options) -> string
7471 * Returns a string containing the characters in +self+, with cases reversed;
7472 * each uppercase character is downcased;
7473 * each lowercase character is upcased:
7475 * s = 'Hello World!' # => "Hello World!"
7476 * s.swapcase # => "hELLO wORLD!"
7478 * The casing may be affected by the given +options+;
7479 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7481 * Related: String#swapcase!.
7485 static VALUE
7486 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7488 rb_encoding *enc;
7489 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7490 VALUE ret;
7492 flags = check_case_options(argc, argv, flags);
7493 enc = str_true_enc(str);
7494 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7495 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7496 ret = rb_str_new(0, RSTRING_LEN(str));
7497 rb_str_ascii_casemap(str, ret, &flags, enc);
7499 else {
7500 ret = rb_str_casemap(str, &flags, enc);
7502 return ret;
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification, consumed by trnext(). */
struct tr {
    int gen;                    /* non-zero while a "a-z" style range is being expanded */
    unsigned int now, max;      /* current code point and last code point of the range */
    char *p, *pend;             /* read position in the specification and its end */
};
7513 static unsigned int
7514 trnext(struct tr *t, rb_encoding *enc)
7516 int n;
7518 for (;;) {
7519 nextpart:
7520 if (!t->gen) {
7521 if (t->p == t->pend) return -1;
7522 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7523 t->p += n;
7525 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7526 t->p += n;
7527 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7528 t->p += n;
7529 if (t->p < t->pend) {
7530 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7531 t->p += n;
7532 if (t->now > c) {
7533 if (t->now < 0x80 && c < 0x80) {
7534 rb_raise(rb_eArgError,
7535 "invalid range \"%c-%c\" in string transliteration",
7536 t->now, c);
7538 else {
7539 rb_raise(rb_eArgError, "invalid range in string transliteration");
7541 continue; /* not reached */
7543 t->gen = 1;
7544 t->max = c;
7547 return t->now;
7549 else {
7550 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7551 if (t->now == t->max) {
7552 t->gen = 0;
7553 goto nextpart;
7556 if (t->now < t->max) {
7557 return t->now;
7559 else {
7560 t->gen = 0;
7561 return t->max;
7567 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7569 static VALUE
7570 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7572 const unsigned int errc = -1;
7573 unsigned int trans[256];
7574 rb_encoding *enc, *e1, *e2;
7575 struct tr trsrc, trrepl;
7576 int cflag = 0;
7577 unsigned int c, c0, last = 0;
7578 int modify = 0, i, l;
7579 unsigned char *s, *send;
7580 VALUE hash = 0;
7581 int singlebyte = single_byte_optimizable(str);
7582 int termlen;
7583 int cr;
7585 #define CHECK_IF_ASCII(c) \
7586 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7587 (cr = ENC_CODERANGE_VALID) : 0)
7589 StringValue(src);
7590 StringValue(repl);
7591 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7592 if (RSTRING_LEN(repl) == 0) {
7593 return rb_str_delete_bang(1, &src, str);
7596 cr = ENC_CODERANGE(str);
7597 e1 = rb_enc_check(str, src);
7598 e2 = rb_enc_check(str, repl);
7599 if (e1 == e2) {
7600 enc = e1;
7602 else {
7603 enc = rb_enc_check(src, repl);
7605 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7606 if (RSTRING_LEN(src) > 1 &&
7607 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7608 trsrc.p + l < trsrc.pend) {
7609 cflag = 1;
7610 trsrc.p += l;
7612 trrepl.p = RSTRING_PTR(repl);
7613 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7614 trsrc.gen = trrepl.gen = 0;
7615 trsrc.now = trrepl.now = 0;
7616 trsrc.max = trrepl.max = 0;
7618 if (cflag) {
7619 for (i=0; i<256; i++) {
7620 trans[i] = 1;
7622 while ((c = trnext(&trsrc, enc)) != errc) {
7623 if (c < 256) {
7624 trans[c] = errc;
7626 else {
7627 if (!hash) hash = rb_hash_new();
7628 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7631 while ((c = trnext(&trrepl, enc)) != errc)
7632 /* retrieve last replacer */;
7633 last = trrepl.now;
7634 for (i=0; i<256; i++) {
7635 if (trans[i] != errc) {
7636 trans[i] = last;
7640 else {
7641 unsigned int r;
7643 for (i=0; i<256; i++) {
7644 trans[i] = errc;
7646 while ((c = trnext(&trsrc, enc)) != errc) {
7647 r = trnext(&trrepl, enc);
7648 if (r == errc) r = trrepl.now;
7649 if (c < 256) {
7650 trans[c] = r;
7651 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7653 else {
7654 if (!hash) hash = rb_hash_new();
7655 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7660 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7661 cr = ENC_CODERANGE_7BIT;
7662 str_modify_keep_cr(str);
7663 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7664 termlen = rb_enc_mbminlen(enc);
7665 if (sflag) {
7666 int clen, tlen;
7667 long offset, max = RSTRING_LEN(str);
7668 unsigned int save = -1;
7669 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7671 while (s < send) {
7672 int may_modify = 0;
7674 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7675 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7677 s += clen;
7678 if (c < 256) {
7679 c = trans[c];
7681 else if (hash) {
7682 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7683 if (NIL_P(tmp)) {
7684 if (cflag) c = last;
7685 else c = errc;
7687 else if (cflag) c = errc;
7688 else c = NUM2INT(tmp);
7690 else {
7691 c = errc;
7693 if (c != (unsigned int)-1) {
7694 if (save == c) {
7695 CHECK_IF_ASCII(c);
7696 continue;
7698 save = c;
7699 tlen = rb_enc_codelen(c, enc);
7700 modify = 1;
7702 else {
7703 save = -1;
7704 c = c0;
7705 if (enc != e1) may_modify = 1;
7707 if ((offset = t - buf) + tlen > max) {
7708 size_t MAYBE_UNUSED(old) = max + termlen;
7709 max = offset + tlen + (send - s);
7710 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7711 t = buf + offset;
7713 rb_enc_mbcput(c, t, enc);
7714 if (may_modify && memcmp(s, t, tlen) != 0) {
7715 modify = 1;
7717 CHECK_IF_ASCII(c);
7718 t += tlen;
7720 if (!STR_EMBED_P(str)) {
7721 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7723 TERM_FILL((char *)t, termlen);
7724 RSTRING(str)->as.heap.ptr = (char *)buf;
7725 RSTRING(str)->as.heap.len = t - buf;
7726 STR_SET_NOEMBED(str);
7727 RSTRING(str)->as.heap.aux.capa = max;
7729 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7730 while (s < send) {
7731 c = (unsigned char)*s;
7732 if (trans[c] != errc) {
7733 if (!cflag) {
7734 c = trans[c];
7735 *s = c;
7736 modify = 1;
7738 else {
7739 *s = last;
7740 modify = 1;
7743 CHECK_IF_ASCII(c);
7744 s++;
7747 else {
7748 int clen, tlen;
7749 long offset, max = (long)((send - s) * 1.2);
7750 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7752 while (s < send) {
7753 int may_modify = 0;
7754 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7755 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7757 if (c < 256) {
7758 c = trans[c];
7760 else if (hash) {
7761 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7762 if (NIL_P(tmp)) {
7763 if (cflag) c = last;
7764 else c = errc;
7766 else if (cflag) c = errc;
7767 else c = NUM2INT(tmp);
7769 else {
7770 c = cflag ? last : errc;
7772 if (c != errc) {
7773 tlen = rb_enc_codelen(c, enc);
7774 modify = 1;
7776 else {
7777 c = c0;
7778 if (enc != e1) may_modify = 1;
7780 if ((offset = t - buf) + tlen > max) {
7781 size_t MAYBE_UNUSED(old) = max + termlen;
7782 max = offset + tlen + (long)((send - s) * 1.2);
7783 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7784 t = buf + offset;
7786 if (s != t) {
7787 rb_enc_mbcput(c, t, enc);
7788 if (may_modify && memcmp(s, t, tlen) != 0) {
7789 modify = 1;
7792 CHECK_IF_ASCII(c);
7793 s += clen;
7794 t += tlen;
7796 if (!STR_EMBED_P(str)) {
7797 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7799 TERM_FILL((char *)t, termlen);
7800 RSTRING(str)->as.heap.ptr = (char *)buf;
7801 RSTRING(str)->as.heap.len = t - buf;
7802 STR_SET_NOEMBED(str);
7803 RSTRING(str)->as.heap.aux.capa = max;
7806 if (modify) {
7807 if (cr != ENC_CODERANGE_BROKEN)
7808 ENC_CODERANGE_SET(str, cr);
7809 rb_enc_associate(str, enc);
7810 return str;
7812 return Qnil;
7817 * call-seq:
7818 * str.tr!(from_str, to_str) -> str or nil
7820 * Translates <i>str</i> in place, using the same rules as
7821 * String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
7822 * were made.
7825 static VALUE
7826 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7828 return tr_trans(str, src, repl, 0);
7833 * call-seq:
7834 * str.tr(from_str, to_str) -> new_str
7836 * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7837 * corresponding characters in +to_str+. If +to_str+ is shorter than
7838 * +from_str+, it is padded with its last character in order to maintain the
7839 * correspondence.
7841 * "hello".tr('el', 'ip') #=> "hippo"
7842 * "hello".tr('aeiou', '*') #=> "h*ll*"
7843 * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7845 * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7846 * characters, and +from_str+ may start with a <code>^</code>, which denotes
7847 * all characters except those listed.
7849 * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7850 * "hello".tr('^aeiou', '*') #=> "*e**o"
7852 * The backslash character <code>\\</code> can be used to escape
7853 * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7854 * appears at the end of a range or the end of the +from_str+ or +to_str+:
7856 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7857 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7859 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7860 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7861 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7863 * "X['\\b']".tr("X\\", "") #=> "['b']"
7864 * "X['\\b']".tr("X-\\]", "") #=> "'b'"
7867 static VALUE
7868 rb_str_tr(VALUE str, VALUE src, VALUE repl)
7870 str = str_duplicate(rb_cString, str);
7871 tr_trans(str, src, repl, 0);
7872 return str;
7875 #define TR_TABLE_MAX (UCHAR_MAX+1)
7876 #define TR_TABLE_SIZE (TR_TABLE_MAX+1)
7877 static void
7878 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
7879 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
7881 const unsigned int errc = -1;
7882 char buf[TR_TABLE_MAX];
7883 struct tr tr;
7884 unsigned int c;
7885 VALUE table = 0, ptable = 0;
7886 int i, l, cflag = 0;
7888 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
7889 tr.gen = tr.now = tr.max = 0;
7891 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
7892 cflag = 1;
7893 tr.p += l;
7895 if (first) {
7896 for (i=0; i<TR_TABLE_MAX; i++) {
7897 stable[i] = 1;
7899 stable[TR_TABLE_MAX] = cflag;
7901 else if (stable[TR_TABLE_MAX] && !cflag) {
7902 stable[TR_TABLE_MAX] = 0;
7904 for (i=0; i<TR_TABLE_MAX; i++) {
7905 buf[i] = cflag;
7908 while ((c = trnext(&tr, enc)) != errc) {
7909 if (c < TR_TABLE_MAX) {
7910 buf[(unsigned char)c] = !cflag;
7912 else {
7913 VALUE key = UINT2NUM(c);
7915 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
7916 if (cflag) {
7917 ptable = *ctablep;
7918 table = ptable ? ptable : rb_hash_new();
7919 *ctablep = table;
7921 else {
7922 table = rb_hash_new();
7923 ptable = *tablep;
7924 *tablep = table;
7927 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
7928 rb_hash_aset(table, key, Qtrue);
7932 for (i=0; i<TR_TABLE_MAX; i++) {
7933 stable[i] = stable[i] && buf[i];
7935 if (!table && !cflag) {
7936 *tablep = 0;
7941 static int
7942 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7944 if (c < TR_TABLE_MAX) {
7945 return table[c] != 0;
7947 else {
7948 VALUE v = UINT2NUM(c);
7950 if (del) {
7951 if (!NIL_P(rb_hash_lookup(del, v)) &&
7952 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7953 return TRUE;
7956 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7957 return FALSE;
7959 return table[TR_TABLE_MAX] ? TRUE : FALSE;
7964 * call-seq:
7965 * str.delete!([other_str]+) -> str or nil
7967 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7968 * <code>nil</code> if <i>str</i> was not modified.
7971 static VALUE
7972 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7974 char squeez[TR_TABLE_SIZE];
7975 rb_encoding *enc = 0;
7976 char *s, *send, *t;
7977 VALUE del = 0, nodel = 0;
7978 int modify = 0;
7979 int i, ascompat, cr;
7981 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7982 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
7983 for (i=0; i<argc; i++) {
7984 VALUE s = argv[i];
7986 StringValue(s);
7987 enc = rb_enc_check(str, s);
7988 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7991 str_modify_keep_cr(str);
7992 ascompat = rb_enc_asciicompat(enc);
7993 s = t = RSTRING_PTR(str);
7994 send = RSTRING_END(str);
7995 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
7996 while (s < send) {
7997 unsigned int c;
7998 int clen;
8000 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8001 if (squeez[c]) {
8002 modify = 1;
8004 else {
8005 if (t != s) *t = c;
8006 t++;
8008 s++;
8010 else {
8011 c = rb_enc_codepoint_len(s, send, &clen, enc);
8013 if (tr_find(c, squeez, del, nodel)) {
8014 modify = 1;
8016 else {
8017 if (t != s) rb_enc_mbcput(c, t, enc);
8018 t += clen;
8019 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
8021 s += clen;
8024 TERM_FILL(t, TERM_LEN(str));
8025 STR_SET_LEN(str, t - RSTRING_PTR(str));
8026 ENC_CODERANGE_SET(str, cr);
8028 if (modify) return str;
8029 return Qnil;
8034 * call-seq:
8035 * str.delete([other_str]+) -> new_str
8037 * Returns a copy of <i>str</i> with all characters in the intersection of its
8038 * arguments deleted. Uses the same rules for building the set of characters as
8039 * String#count.
8041 * "hello".delete "l","lo" #=> "heo"
8042 * "hello".delete "lo" #=> "he"
8043 * "hello".delete "aeiou", "^e" #=> "hell"
8044 * "hello".delete "ej-m" #=> "ho"
8047 static VALUE
8048 rb_str_delete(int argc, VALUE *argv, VALUE str)
8050 str = str_duplicate(rb_cString, str);
8051 rb_str_delete_bang(argc, argv, str);
8052 return str;
8057 * call-seq:
8058 * str.squeeze!([other_str]*) -> str or nil
8060 * Squeezes <i>str</i> in place, returning either <i>str</i>, or
8061 * <code>nil</code> if no changes were made.
8064 static VALUE
8065 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8067 char squeez[TR_TABLE_SIZE];
8068 rb_encoding *enc = 0;
8069 VALUE del = 0, nodel = 0;
8070 unsigned char *s, *send, *t;
8071 int i, modify = 0;
8072 int ascompat, singlebyte = single_byte_optimizable(str);
8073 unsigned int save;
8075 if (argc == 0) {
8076 enc = STR_ENC_GET(str);
8078 else {
8079 for (i=0; i<argc; i++) {
8080 VALUE s = argv[i];
8082 StringValue(s);
8083 enc = rb_enc_check(str, s);
8084 if (singlebyte && !single_byte_optimizable(s))
8085 singlebyte = 0;
8086 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8090 str_modify_keep_cr(str);
8091 s = t = (unsigned char *)RSTRING_PTR(str);
8092 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8093 send = (unsigned char *)RSTRING_END(str);
8094 save = -1;
8095 ascompat = rb_enc_asciicompat(enc);
8097 if (singlebyte) {
8098 while (s < send) {
8099 unsigned int c = *s++;
8100 if (c != save || (argc > 0 && !squeez[c])) {
8101 *t++ = save = c;
8105 else {
8106 while (s < send) {
8107 unsigned int c;
8108 int clen;
8110 if (ascompat && (c = *s) < 0x80) {
8111 if (c != save || (argc > 0 && !squeez[c])) {
8112 *t++ = save = c;
8114 s++;
8116 else {
8117 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8119 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8120 if (t != s) rb_enc_mbcput(c, t, enc);
8121 save = c;
8122 t += clen;
8124 s += clen;
8129 TERM_FILL((char *)t, TERM_LEN(str));
8130 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8131 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8132 modify = 1;
8135 if (modify) return str;
8136 return Qnil;
8141 * call-seq:
8142 * str.squeeze([other_str]*) -> new_str
8144 * Builds a set of characters from the <i>other_str</i> parameter(s)
8145 * using the procedure described for String#count. Returns a new
8146 * string where runs of the same character that occur in this set are
8147 * replaced by a single character. If no arguments are given, all
8148 * runs of identical characters are replaced by a single character.
8150 * "yellow moon".squeeze #=> "yelow mon"
8151 * "  now   is  the".squeeze(" ") #=> " now is the"
8152 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8155 static VALUE
8156 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8158 str = str_duplicate(rb_cString, str);
8159 rb_str_squeeze_bang(argc, argv, str);
8160 return str;
8165 * call-seq:
8166 * str.tr_s!(from_str, to_str) -> str or nil
8168 * Performs String#tr_s processing on <i>str</i> in place,
8169 * returning <i>str</i>, or <code>nil</code> if no changes were made.
8172 static VALUE
8173 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8175 return tr_trans(str, src, repl, 1);
8180 * call-seq:
8181 * str.tr_s(from_str, to_str) -> new_str
8183 * Processes a copy of <i>str</i> as described under String#tr, then
8184 * removes duplicate characters in regions that were affected by the
8185 * translation.
8187 * "hello".tr_s('l', 'r') #=> "hero"
8188 * "hello".tr_s('el', '*') #=> "h*o"
8189 * "hello".tr_s('el', 'hx') #=> "hhxo"
8192 static VALUE
8193 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8195 str = str_duplicate(rb_cString, str);
8196 tr_trans(str, src, repl, 1);
8197 return str;
8202 * call-seq:
8203 * str.count([other_str]+) -> integer
8205 * Each +other_str+ parameter defines a set of characters to count. The
8206 * intersection of these sets defines the characters to count in +str+. Any
8207 * +other_str+ that starts with a caret <code>^</code> is negated. The
8208 * sequence <code>c1-c2</code> means all characters between c1 and c2. The
8209 * backslash character <code>\\</code> can be used to escape <code>^</code> or
8210 * <code>-</code> and is otherwise ignored unless it appears at the end of a
8211 * sequence or the end of a +other_str+.
8213 * a = "hello world"
8214 * a.count "lo" #=> 5
8215 * a.count "lo", "o" #=> 2
8216 * a.count "hello", "^l" #=> 4
8217 * a.count "ej-m" #=> 4
8219 * "hello^world".count "\\^aeiou" #=> 4
8220 * "hello-world".count "a\\-eo" #=> 4
8222 * c = "hello world\\r\\n"
8223 * c.count "\\" #=> 2
8224 * c.count "\\A" #=> 0
8225 * c.count "X-\\w" #=> 3
8228 static VALUE
8229 rb_str_count(int argc, VALUE *argv, VALUE str)
8231 char table[TR_TABLE_SIZE];
8232 rb_encoding *enc = 0;
8233 VALUE del = 0, nodel = 0, tstr;
8234 char *s, *send;
8235 int i;
8236 int ascompat;
8237 size_t n = 0;
8239 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8241 tstr = argv[0];
8242 StringValue(tstr);
8243 enc = rb_enc_check(str, tstr);
8244 if (argc == 1) {
8245 const char *ptstr;
8246 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8247 (ptstr = RSTRING_PTR(tstr),
8248 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8249 !is_broken_string(str)) {
8250 int clen;
8251 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8253 s = RSTRING_PTR(str);
8254 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8255 send = RSTRING_END(str);
8256 while (s < send) {
8257 if (*(unsigned char*)s++ == c) n++;
8259 return SIZET2NUM(n);
8263 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8264 for (i=1; i<argc; i++) {
8265 tstr = argv[i];
8266 StringValue(tstr);
8267 enc = rb_enc_check(str, tstr);
8268 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8271 s = RSTRING_PTR(str);
8272 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8273 send = RSTRING_END(str);
8274 ascompat = rb_enc_asciicompat(enc);
8275 while (s < send) {
8276 unsigned int c;
8278 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8279 if (table[c]) {
8280 n++;
8282 s++;
8284 else {
8285 int clen;
8286 c = rb_enc_codepoint_len(s, send, &clen, enc);
8287 if (tr_find(c, table, del, nodel)) {
8288 n++;
8290 s += clen;
8294 return SIZET2NUM(n);
8297 static VALUE
8298 rb_fs_check(VALUE val)
8300 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8301 val = rb_check_string_type(val);
8302 if (NIL_P(val)) return 0;
8304 return val;
/*
 * Byte-indexed lookup table: 1 for the six bytes 0x09-0x0d and 0x20,
 * 0 for every other byte value (unlisted entries are zero-initialised).
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* \t */
    [0x0a] = 1, /* \n */
    [0x0b] = 1, /* \v */
    [0x0c] = 1, /* \f */
    [0x0d] = 1, /* \r */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8328 static long
8329 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8331 if (empty_count >= 0 && len == 0) {
8332 return empty_count + 1;
8334 if (empty_count > 0) {
8335 /* make different substrings */
8336 if (result) {
8337 do {
8338 rb_ary_push(result, str_new_empty_String(str));
8339 } while (--empty_count > 0);
8341 else {
8342 do {
8343 rb_yield(str_new_empty_String(str));
8344 } while (--empty_count > 0);
8347 str = rb_str_subseq(str, beg, len);
8348 if (result) {
8349 rb_ary_push(result, str);
8351 else {
8352 rb_yield(str);
8354 return empty_count;
8357 typedef enum {
8358 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
8359 } split_type_t;
8361 static split_type_t
8362 literal_split_pattern(VALUE spat, split_type_t default_type)
8364 rb_encoding *enc = STR_ENC_GET(spat);
8365 const char *ptr;
8366 long len;
8367 RSTRING_GETMEM(spat, ptr, len);
8368 if (len == 0) {
8369 /* Special case - split into chars */
8370 return SPLIT_TYPE_CHARS;
8372 else if (rb_enc_asciicompat(enc)) {
8373 if (len == 1 && ptr[0] == ' ') {
8374 return SPLIT_TYPE_AWK;
8377 else {
8378 int l;
8379 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8380 return SPLIT_TYPE_AWK;
8383 return default_type;
8387 * call-seq:
8388 * str.split(pattern=nil, [limit]) -> an_array
8389 * str.split(pattern=nil, [limit]) {|sub| block } -> str
8391 * Divides <i>str</i> into substrings based on a delimiter, returning an array
8392 * of these substrings.
8394 * If <i>pattern</i> is a String, then its contents are used as
8395 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8396 * space, <i>str</i> is split on whitespace, with leading and trailing
8397 * whitespace and runs of contiguous whitespace characters ignored.
8399 * If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8400 * pattern matches. Whenever the pattern matches a zero-length string,
8401 * <i>str</i> is split into individual characters. If <i>pattern</i> contains
8402 * groups, the respective matches will be returned in the array as well.
8404 * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8405 * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8406 * split on whitespace as if ' ' were specified.
8408 * If the <i>limit</i> parameter is omitted, trailing null fields are
8409 * suppressed. If <i>limit</i> is a positive number, at most that number
8410 * of split substrings will be returned (captured groups will be returned
8411 * as well, but are not counted towards the limit).
8412 * If <i>limit</i> is <code>1</code>, the entire
8413 * string is returned as the only entry in an array. If negative, there is no
8414 * limit to the number of fields returned, and trailing null fields are not
8415 * suppressed.
8417 * When the input +str+ is empty an empty Array is returned as the string is
8418 * considered to have no fields to split.
8420 * " now's the time ".split #=> ["now's", "the", "time"]
8421 * " now's the time ".split(' ') #=> ["now's", "the", "time"]
8422 * " now's  the time".split(/ /) #=> ["", "now's", "", "the", "time"]
8423 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8424 * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
8425 * "hello".split(//, 3) #=> ["h", "e", "llo"]
8426 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
8428 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
8429 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
8430 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
8431 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
8433 * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
8435 * "".split(',', -1) #=> []
8437 * If a block is given, invoke the block with each split substring.
8441 static VALUE
8442 rb_str_split_m(int argc, VALUE *argv, VALUE str)
8444 rb_encoding *enc;
8445 VALUE spat;
8446 VALUE limit;
8447 split_type_t split_type;
8448 long beg, end, i = 0, empty_count = -1;
8449 int lim = 0;
8450 VALUE result, tmp;
8452 result = rb_block_given_p() ? Qfalse : Qnil;
8453 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8454 lim = NUM2INT(limit);
8455 if (lim <= 0) limit = Qnil;
8456 else if (lim == 1) {
8457 if (RSTRING_LEN(str) == 0)
8458 return result ? rb_ary_new2(0) : str;
8459 tmp = str_duplicate(rb_cString, str);
8460 if (!result) {
8461 rb_yield(tmp);
8462 return str;
8464 return rb_ary_new3(1, tmp);
8466 i = 1;
8468 if (NIL_P(limit) && !lim) empty_count = 0;
8470 enc = STR_ENC_GET(str);
8471 split_type = SPLIT_TYPE_REGEXP;
8472 if (!NIL_P(spat)) {
8473 spat = get_pat_quoted(spat, 0);
8475 else if (NIL_P(spat = rb_fs)) {
8476 split_type = SPLIT_TYPE_AWK;
8478 else if (!(spat = rb_fs_check(spat))) {
8479 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8481 else {
8482 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8484 if (split_type != SPLIT_TYPE_AWK) {
8485 switch (BUILTIN_TYPE(spat)) {
8486 case T_REGEXP:
8487 rb_reg_options(spat); /* check if uninitialized */
8488 tmp = RREGEXP_SRC(spat);
8489 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8490 if (split_type == SPLIT_TYPE_AWK) {
8491 spat = tmp;
8492 split_type = SPLIT_TYPE_STRING;
8494 break;
8496 case T_STRING:
8497 mustnot_broken(spat);
8498 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8499 break;
8501 default:
8502 UNREACHABLE_RETURN(Qnil);
8506 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8508 if (result) result = rb_ary_new();
8509 beg = 0;
8510 char *ptr = RSTRING_PTR(str);
8511 char *eptr = RSTRING_END(str);
8512 if (split_type == SPLIT_TYPE_AWK) {
8513 char *bptr = ptr;
8514 int skip = 1;
8515 unsigned int c;
8517 end = beg;
8518 if (is_ascii_string(str)) {
8519 while (ptr < eptr) {
8520 c = (unsigned char)*ptr++;
8521 if (skip) {
8522 if (ascii_isspace(c)) {
8523 beg = ptr - bptr;
8525 else {
8526 end = ptr - bptr;
8527 skip = 0;
8528 if (!NIL_P(limit) && lim <= i) break;
8531 else if (ascii_isspace(c)) {
8532 SPLIT_STR(beg, end-beg);
8533 skip = 1;
8534 beg = ptr - bptr;
8535 if (!NIL_P(limit)) ++i;
8537 else {
8538 end = ptr - bptr;
8542 else {
8543 while (ptr < eptr) {
8544 int n;
8546 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8547 ptr += n;
8548 if (skip) {
8549 if (rb_isspace(c)) {
8550 beg = ptr - bptr;
8552 else {
8553 end = ptr - bptr;
8554 skip = 0;
8555 if (!NIL_P(limit) && lim <= i) break;
8558 else if (rb_isspace(c)) {
8559 SPLIT_STR(beg, end-beg);
8560 skip = 1;
8561 beg = ptr - bptr;
8562 if (!NIL_P(limit)) ++i;
8564 else {
8565 end = ptr - bptr;
8570 else if (split_type == SPLIT_TYPE_STRING) {
8571 char *str_start = ptr;
8572 char *substr_start = ptr;
8573 char *sptr = RSTRING_PTR(spat);
8574 long slen = RSTRING_LEN(spat);
8576 mustnot_broken(str);
8577 enc = rb_enc_check(str, spat);
8578 while (ptr < eptr &&
8579 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8580 /* Check we are at the start of a char */
8581 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8582 if (t != ptr + end) {
8583 ptr = t;
8584 continue;
8586 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8587 ptr += end + slen;
8588 substr_start = ptr;
8589 if (!NIL_P(limit) && lim <= ++i) break;
8591 beg = ptr - str_start;
8593 else if (split_type == SPLIT_TYPE_CHARS) {
8594 char *str_start = ptr;
8595 int n;
8597 mustnot_broken(str);
8598 enc = rb_enc_get(str);
8599 while (ptr < eptr &&
8600 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8601 SPLIT_STR(ptr - str_start, n);
8602 ptr += n;
8603 if (!NIL_P(limit) && lim <= ++i) break;
8605 beg = ptr - str_start;
8607 else {
8608 long len = RSTRING_LEN(str);
8609 long start = beg;
8610 long idx;
8611 int last_null = 0;
8612 struct re_registers *regs;
8613 VALUE match = 0;
8615 for (; rb_reg_search(spat, str, start, 0) >= 0;
8616 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8617 match = rb_backref_get();
8618 if (!result) rb_match_busy(match);
8619 regs = RMATCH_REGS(match);
8620 end = BEG(0);
8621 if (start == end && BEG(0) == END(0)) {
8622 if (!ptr) {
8623 SPLIT_STR(0, 0);
8624 break;
8626 else if (last_null == 1) {
8627 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8628 beg = start;
8630 else {
8631 if (start == len)
8632 start++;
8633 else
8634 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8635 last_null = 1;
8636 continue;
8639 else {
8640 SPLIT_STR(beg, end-beg);
8641 beg = start = END(0);
8643 last_null = 0;
8645 for (idx=1; idx < regs->num_regs; idx++) {
8646 if (BEG(idx) == -1) continue;
8647 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8649 if (!NIL_P(limit) && lim <= ++i) break;
8651 if (match) rb_match_unbusy(match);
8653 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8654 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8657 return result ? result : str;
8660 VALUE
8661 rb_str_split(VALUE str, const char *sep0)
8663 VALUE sep;
8665 StringValue(str);
8666 sep = rb_str_new_cstr(sep0);
8667 return rb_str_split_m(1, &sep, str);
8670 #define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8672 static inline int
8673 enumerator_element(VALUE ary, VALUE e)
8675 if (ary) {
8676 rb_ary_push(ary, e);
8677 return 0;
8679 else {
8680 rb_yield(e);
8681 return 1;
8685 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8687 static const char *
8688 chomp_newline(const char *p, const char *e, rb_encoding *enc)
8690 const char *prev = rb_enc_prev_char(p, e, e, enc);
8691 if (rb_enc_is_newline(prev, e, enc)) {
8692 e = prev;
8693 prev = rb_enc_prev_char(p, e, e, enc);
8694 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8695 e = prev;
8697 return e;
8700 static VALUE
8701 get_rs(void)
8703 VALUE rs = rb_rs;
8704 if (!NIL_P(rs) &&
8705 (!RB_TYPE_P(rs, T_STRING) ||
8706 RSTRING_LEN(rs) != 1 ||
8707 RSTRING_PTR(rs)[0] != '\n')) {
8708 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8710 return rs;
8713 #define rb_rs get_rs()
8715 static VALUE
8716 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8718 rb_encoding *enc;
8719 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8720 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8721 long pos, len, rslen;
8722 int rsnewline = 0;
8724 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8725 rs = rb_rs;
8726 if (!NIL_P(opts)) {
8727 static ID keywords[1];
8728 if (!keywords[0]) {
8729 keywords[0] = rb_intern_const("chomp");
8731 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
8732 chomp = (chomp != Qundef && RTEST(chomp));
8735 if (NIL_P(rs)) {
8736 if (!ENUM_ELEM(ary, str)) {
8737 return ary;
8739 else {
8740 return orig;
8744 if (!RSTRING_LEN(str)) goto end;
8745 str = rb_str_new_frozen(str);
8746 ptr = subptr = RSTRING_PTR(str);
8747 pend = RSTRING_END(str);
8748 len = RSTRING_LEN(str);
8749 StringValue(rs);
8750 rslen = RSTRING_LEN(rs);
8752 if (rs == rb_default_rs)
8753 enc = rb_enc_get(str);
8754 else
8755 enc = rb_enc_check(str, rs);
8757 if (rslen == 0) {
8758 /* paragraph mode */
8759 int n;
8760 const char *eol = NULL;
8761 subend = subptr;
8762 while (subend < pend) {
8763 do {
8764 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
8765 n = 0;
8766 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
8767 if (rb_enc_is_newline(subend + n, pend, enc)) {
8768 if (eol == subend) break;
8769 subend += rslen;
8770 if (subptr) eol = subend;
8772 else {
8773 if (!subptr) subptr = subend;
8774 subend += rslen;
8776 rslen = 0;
8777 } while (subend < pend);
8778 if (!subptr) break;
8779 line = rb_str_subseq(str, subptr - ptr,
8780 subend - subptr + (chomp ? 0 : rslen));
8781 if (ENUM_ELEM(ary, line)) {
8782 str_mod_check(str, ptr, len);
8784 subptr = eol = NULL;
8786 goto end;
8788 else {
8789 rsptr = RSTRING_PTR(rs);
8790 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
8791 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
8792 rsnewline = 1;
8796 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
8797 rs = rb_str_new(rsptr, rslen);
8798 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
8799 rsptr = RSTRING_PTR(rs);
8800 rslen = RSTRING_LEN(rs);
8803 while (subptr < pend) {
8804 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
8805 if (pos < 0) break;
8806 hit = subptr + pos;
8807 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
8808 if (hit != adjusted) {
8809 subptr = adjusted;
8810 continue;
8812 subend = hit += rslen;
8813 if (chomp) {
8814 if (rsnewline) {
8815 subend = chomp_newline(subptr, subend, enc);
8817 else {
8818 subend -= rslen;
8821 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
8822 if (ENUM_ELEM(ary, line)) {
8823 str_mod_check(str, ptr, len);
8825 subptr = hit;
8828 if (subptr != pend) {
8829 if (chomp) {
8830 if (rsnewline) {
8831 pend = chomp_newline(subptr, pend, enc);
8833 else if (pend - subptr >= rslen &&
8834 memcmp(pend - rslen, rsptr, rslen) == 0) {
8835 pend -= rslen;
8838 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
8839 ENUM_ELEM(ary, line);
8840 RB_GC_GUARD(str);
8843 end:
8844 if (ary)
8845 return ary;
8846 else
8847 return orig;
8851 * call-seq:
8852 * str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8853 * str.each_line(separator=$/, chomp: false) -> an_enumerator
8855 * Splits <i>str</i> using the supplied parameter as the record
8856 * separator (<code>$/</code> by default), passing each substring in
8857 * turn to the supplied block. If a zero-length record separator is
8858 * supplied, the string is split into paragraphs delimited by
8859 * multiple successive newlines.
8861 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8862 * line.
8864 * If no block is given, an enumerator is returned instead.
8866 * "hello\nworld".each_line {|s| p s}
8867 * # prints:
8868 * # "hello\n"
8869 * # "world"
8871 * "hello\nworld".each_line('l') {|s| p s}
8872 * # prints:
8873 * # "hel"
8874 * # "l"
8875 * # "o\nworl"
8876 * # "d"
8878 * "hello\n\n\nworld".each_line('') {|s| p s}
8879 * # prints
8880 * # "hello\n\n"
8881 * # "world"
8883 * "hello\nworld".each_line(chomp: true) {|s| p s}
8884 * # prints:
8885 * # "hello"
8886 * # "world"
8888 * "hello\nworld".each_line('l', chomp: true) {|s| p s}
8889 * # prints:
8890 * # "he"
8891 * # ""
8892 * # "o\nwor"
8893 * # "d"
8897 static VALUE
8898 rb_str_each_line(int argc, VALUE *argv, VALUE str)
8900 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
8901 return rb_str_enumerate_lines(argc, argv, str, 0);
8905 * call-seq:
8906 * str.lines(separator=$/, chomp: false) -> an_array
8908 * Returns an array of lines in <i>str</i> split using the supplied
8909 * record separator (<code>$/</code> by default). This is a
8910 * shorthand for <code>str.each_line(separator, getline_args).to_a</code>.
8912 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8913 * line.
8915 * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
8916 * "hello  world".lines(' ') #=> ["hello ", " ", "world"]
8917 * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8919 * If a block is given, which is a deprecated form, works the same as
8920 * <code>each_line</code>.
8923 static VALUE
8924 rb_str_lines(int argc, VALUE *argv, VALUE str)
8926 VALUE ary = WANTARRAY("lines", 0);
8927 return rb_str_enumerate_lines(argc, argv, str, ary);
8930 static VALUE
8931 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8933 return LONG2FIX(RSTRING_LEN(str));
8936 static VALUE
8937 rb_str_enumerate_bytes(VALUE str, VALUE ary)
8939 long i;
8941 for (i=0; i<RSTRING_LEN(str); i++) {
8942 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
8944 if (ary)
8945 return ary;
8946 else
8947 return str;
8951 * call-seq:
8952 * str.each_byte {|integer| block } -> str
8953 * str.each_byte -> an_enumerator
8955 * Passes each byte in <i>str</i> to the given block, or returns an
8956 * enumerator if no block is given.
8958 * "hello".each_byte {|c| print c, ' ' }
8960 * <em>produces:</em>
8962 * 104 101 108 108 111
8965 static VALUE
8966 rb_str_each_byte(VALUE str)
8968 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
8969 return rb_str_enumerate_bytes(str, 0);
8973 * call-seq:
8974 * str.bytes -> an_array
8976 * Returns an array of bytes in <i>str</i>. This is a shorthand for
8977 * <code>str.each_byte.to_a</code>.
8979 * If a block is given, which is a deprecated form, works the same as
8980 * <code>each_byte</code>.
8983 static VALUE
8984 rb_str_bytes(VALUE str)
8986 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
8987 return rb_str_enumerate_bytes(str, ary);
8990 static VALUE
8991 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
8993 return rb_str_length(str);
8996 static VALUE
8997 rb_str_enumerate_chars(VALUE str, VALUE ary)
8999 VALUE orig = str;
9000 long i, len, n;
9001 const char *ptr;
9002 rb_encoding *enc;
9004 str = rb_str_new_frozen(str);
9005 ptr = RSTRING_PTR(str);
9006 len = RSTRING_LEN(str);
9007 enc = rb_enc_get(str);
9009 if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) {
9010 for (i = 0; i < len; i += n) {
9011 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9012 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9015 else {
9016 for (i = 0; i < len; i += n) {
9017 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9018 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9021 RB_GC_GUARD(str);
9022 if (ary)
9023 return ary;
9024 else
9025 return orig;
9029 * call-seq:
9030 * str.each_char {|cstr| block } -> str
9031 * str.each_char -> an_enumerator
9033 * Passes each character in <i>str</i> to the given block, or returns
9034 * an enumerator if no block is given.
9036 * "hello".each_char {|c| print c, ' ' }
9038 * <em>produces:</em>
9040 * h e l l o
9043 static VALUE
9044 rb_str_each_char(VALUE str)
9046 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9047 return rb_str_enumerate_chars(str, 0);
9051 * call-seq:
9052 * str.chars -> an_array
9054 * Returns an array of characters in <i>str</i>. This is a shorthand
9055 * for <code>str.each_char.to_a</code>.
9057 * If a block is given, which is a deprecated form, works the same as
9058 * <code>each_char</code>.
9061 static VALUE
9062 rb_str_chars(VALUE str)
9064 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9065 return rb_str_enumerate_chars(str, ary);
9068 static VALUE
9069 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9071 VALUE orig = str;
9072 int n;
9073 unsigned int c;
9074 const char *ptr, *end;
9075 rb_encoding *enc;
9077 if (single_byte_optimizable(str))
9078 return rb_str_enumerate_bytes(str, ary);
9080 str = rb_str_new_frozen(str);
9081 ptr = RSTRING_PTR(str);
9082 end = RSTRING_END(str);
9083 enc = STR_ENC_GET(str);
9085 while (ptr < end) {
9086 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9087 ENUM_ELEM(ary, UINT2NUM(c));
9088 ptr += n;
9090 RB_GC_GUARD(str);
9091 if (ary)
9092 return ary;
9093 else
9094 return orig;
9098 * call-seq:
9099 * str.each_codepoint {|integer| block } -> str
9100 * str.each_codepoint -> an_enumerator
9102 * Passes the Integer ordinal of each character in <i>str</i>
9103 * (also known as a <i>codepoint</i> when applied to Unicode strings) to the
9104 * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
9105 * values are directly derived from the binary representation
9106 * of each character.
9108 * If no block is given, an enumerator is returned instead.
9110 * "hello\u0639".each_codepoint {|c| print c, ' ' }
9112 * <em>produces:</em>
9114 * 104 101 108 108 111 1593
9117 static VALUE
9118 rb_str_each_codepoint(VALUE str)
9120 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9121 return rb_str_enumerate_codepoints(str, 0);
9125 * call-seq:
9126 * str.codepoints -> an_array
9128 * Returns an array of the Integer ordinals of the
9129 * characters in <i>str</i>. This is a shorthand for
9130 * <code>str.each_codepoint.to_a</code>.
9132 * If a block is given, which is a deprecated form, works the same as
9133 * <code>each_codepoint</code>.
9136 static VALUE
9137 rb_str_codepoints(VALUE str)
9139 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9140 return rb_str_enumerate_codepoints(str, ary);
/*
 * Returns a compiled regexp for the pattern \X in encoding +enc+.
 *
 * The regexp compiled for UTF-8 is kept in a function-local static and
 * handed out again on later calls.  For every other encoding a new
 * regexp is compiled on each call and is not stored here.
 * Compilation failure is reported through rb_fatal().
 */
9143 static regex_t *
9144 get_reg_grapheme_cluster(rb_encoding *enc)
9146 int encidx = rb_enc_to_index(enc);
9147 regex_t *reg_grapheme_cluster = NULL;
9148 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9150 /* synchronize */
/* Reuse the cached UTF-8 regexp when there is one. */
9151 if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
9152 reg_grapheme_cluster = reg_grapheme_cluster_utf8;
9154 if (!reg_grapheme_cluster) {
/* Default pattern text: backslash + X as single bytes; the length
 * excludes the terminating NUL of the literal. */
9155 const OnigUChar source_ascii[] = "\\X";
9156 OnigErrorInfo einfo;
9157 const OnigUChar *source = source_ascii;
9158 size_t source_len = sizeof(source_ascii) - 1;
9159 switch (encidx) {
/* For UTF-16/UTF-32 the two pattern characters are spelled out in the
 * matching code-unit width and byte order; CASE_UTF generates one
 * switch case per encoding. */
9160 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9161 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9162 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9163 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9164 #define CASE_UTF(e) \
9165 case ENCINDEX_UTF_##e: { \
9166 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9167 source = source_UTF_##e; \
9168 source_len = sizeof(source_UTF_##e); \
9169 break; \
9171 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9172 #undef CASE_UTF
9173 #undef CHARS_16BE
9174 #undef CHARS_16LE
9175 #undef CHARS_32BE
9176 #undef CHARS_32LE
/* Compile the pattern; a non-zero result is an error code. */
9178 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9179 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9180 if (r) {
9181 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9182 onig_error_code_to_str(message, r, &einfo);
9183 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
/* Only the UTF-8 regexp is remembered for later calls. */
9185 if (encidx == rb_utf8_encindex()) {
9186 reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
9189 return reg_grapheme_cluster;
9192 static VALUE
9193 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9195 size_t grapheme_cluster_count = 0;
9196 regex_t *reg_grapheme_cluster = NULL;
9197 rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9198 const char *ptr, *end;
9200 if (!rb_enc_unicode_p(enc)) {
9201 return rb_str_length(str);
9204 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9205 ptr = RSTRING_PTR(str);
9206 end = RSTRING_END(str);
9208 while (ptr < end) {
9209 OnigPosition len = onig_match(reg_grapheme_cluster,
9210 (const OnigUChar *)ptr, (const OnigUChar *)end,
9211 (const OnigUChar *)ptr, NULL, 0);
9212 if (len <= 0) break;
9213 grapheme_cluster_count++;
9214 ptr += len;
9217 return SIZET2NUM(grapheme_cluster_count);
9220 static VALUE
9221 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9223 VALUE orig = str;
9224 regex_t *reg_grapheme_cluster = NULL;
9225 rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9226 const char *ptr0, *ptr, *end;
9228 if (!rb_enc_unicode_p(enc)) {
9229 return rb_str_enumerate_chars(str, ary);
9232 if (!ary) str = rb_str_new_frozen(str);
9233 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9234 ptr0 = ptr = RSTRING_PTR(str);
9235 end = RSTRING_END(str);
9237 while (ptr < end) {
9238 OnigPosition len = onig_match(reg_grapheme_cluster,
9239 (const OnigUChar *)ptr, (const OnigUChar *)end,
9240 (const OnigUChar *)ptr, NULL, 0);
9241 if (len <= 0) break;
9242 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9243 ptr += len;
9245 RB_GC_GUARD(str);
9246 if (ary)
9247 return ary;
9248 else
9249 return orig;
9253 * call-seq:
9254 * str.each_grapheme_cluster {|cstr| block } -> str
9255 * str.each_grapheme_cluster -> an_enumerator
9257 * Passes each grapheme cluster in <i>str</i> to the given block, or returns
9258 * an enumerator if no block is given.
9259 * Unlike String#each_char, this enumerates by grapheme clusters defined by
9260 * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9262 * "a\u0300".each_char.to_a.size #=> 2
9263 * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9267 static VALUE
9268 rb_str_each_grapheme_cluster(VALUE str)
9270 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9271 return rb_str_enumerate_grapheme_clusters(str, 0);
9275 * call-seq:
9276 * str.grapheme_clusters -> an_array
9278 * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
9279 * for <code>str.each_grapheme_cluster.to_a</code>.
9281 * If a block is given, which is a deprecated form, works the same as
9282 * <code>each_grapheme_cluster</code>.
9285 static VALUE
9286 rb_str_grapheme_clusters(VALUE str)
9288 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9289 return rb_str_enumerate_grapheme_clusters(str, ary);
9292 static long
9293 chopped_length(VALUE str)
9295 rb_encoding *enc = STR_ENC_GET(str);
9296 const char *p, *p2, *beg, *end;
9298 beg = RSTRING_PTR(str);
9299 end = beg + RSTRING_LEN(str);
9300 if (beg >= end) return 0;
9301 p = rb_enc_prev_char(beg, end, end, enc);
9302 if (!p) return 0;
9303 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9304 p2 = rb_enc_prev_char(beg, p, end, enc);
9305 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9307 return p - beg;
9311 * call-seq:
9312 * str.chop! -> str or nil
9314 * Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9315 * <code>nil</code> if <i>str</i> is the empty string. See also
9316 * String#chomp!.
9319 static VALUE
9320 rb_str_chop_bang(VALUE str)
9322 str_modify_keep_cr(str);
9323 if (RSTRING_LEN(str) > 0) {
9324 long len;
9325 len = chopped_length(str);
9326 STR_SET_LEN(str, len);
9327 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9328 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9329 ENC_CODERANGE_CLEAR(str);
9331 return str;
9333 return Qnil;
9338 * call-seq:
9339 * str.chop -> new_str
9341 * Returns a new String with the last character removed. If the
9342 * string ends with <code>\r\n</code>, both characters are
9343 * removed. Applying <code>chop</code> to an empty string returns an
9344 * empty string. String#chomp is often a safer alternative, as it
9345 * leaves the string unchanged if it doesn't end in a record
9346 * separator.
9348 * "string\r\n".chop #=> "string"
9349 * "string\n\r".chop #=> "string\n"
9350 * "string\n".chop #=> "string"
9351 * "string".chop #=> "strin"
9352 * "x".chop.chop #=> ""
9355 static VALUE
9356 rb_str_chop(VALUE str)
9358 return rb_str_subseq(str, 0, chopped_length(str));
/*
 * Returns the length of the byte range [p, e) of +str+ after removing
 * one trailing line terminator: "\n", "\r\n" or a lone "\r".
 * The single-byte branch reads *(e-1) without a bounds check, so it
 * assumes e > p (NOTE(review): the caller visible in this file returns
 * early for empty strings before calling this).
 */
9361 static long
9362 smart_chomp(VALUE str, const char *e, const char *p)
9364 rb_encoding *enc = rb_enc_get(str);
/* Encodings whose minimum character width is more than one byte:
 * locate character heads instead of looking at single bytes. */
9365 if (rb_enc_mbminlen(enc) > 1) {
9366 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
/* Drop a trailing newline character, if there is one. */
9367 if (rb_enc_is_newline(pp, e, enc)) {
9368 e = pp;
/* Then drop the character before the (possibly moved) end if it is "\r". */
9370 pp = e - rb_enc_mbminlen(enc);
9371 if (pp >= p) {
9372 pp = rb_enc_left_char_head(p, pp, e, enc);
9373 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9374 e = pp;
9378 else {
/* Byte-wise handling: "\n" (optionally preceded by "\r") or "\r". */
9379 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9380 case '\n':
9381 if (--e > p && *(e-1) == '\r') {
9382 --e;
9384 break;
9385 case '\r':
9386 --e;
9387 break;
9390 return e - p;
/*
 * Returns the byte length +str+ has after removing the record
 * separator +rs+ from its end; the string itself is not modified.
 *
 *  - empty +str+: 0
 *  - +rs+ is rb_default_rs, or a single newline character: smart_chomp()
 *  - empty +rs+: every trailing "\n", each optionally preceded by "\r",
 *    is removed
 *  - otherwise: +rs+ is removed once if +str+ ends with exactly those
 *    bytes on a character boundary; if not, the full length is returned
 */
9393 static long
9394 chompped_length(VALUE str, VALUE rs)
9396 rb_encoding *enc;
9397 int newline;
9398 char *pp, *e, *rsptr;
9399 long rslen;
9400 char *const p = RSTRING_PTR(str);
9401 long len = RSTRING_LEN(str);
9403 if (len == 0) return 0;
9404 e = p + len;
9405 if (rs == rb_default_rs) {
9406 return smart_chomp(str, e, p);
9409 enc = rb_enc_get(str);
9410 RSTRING_GETMEM(rs, rsptr, rslen);
/* Empty separator: strip all trailing newlines. */
9411 if (rslen == 0) {
9412 if (rb_enc_mbminlen(enc) > 1) {
9413 while (e > p) {
9414 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9415 if (!rb_enc_is_newline(pp, e, enc)) break;
9416 e = pp;
9417 pp -= rb_enc_mbminlen(enc);
9418 if (pp >= p) {
9419 pp = rb_enc_left_char_head(p, pp, e, enc);
9420 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9421 e = pp;
9426 else {
9427 while (e > p && *(e-1) == '\n') {
9428 --e;
9429 if (e > p && *(e-1) == '\r')
9430 --e;
9433 return e - p;
/* A separator longer than the string cannot match. */
9435 if (rslen > len) return len;
/* From here on +enc+ is the separator's encoding.  A separator that is
 * exactly one minimum-width character and is a newline gets the same
 * treatment as the default separator. */
9437 enc = rb_enc_get(rs);
9438 newline = rsptr[rslen-1];
9439 if (rslen == rb_enc_mbminlen(enc)) {
9440 if (rslen == 1) {
9441 if (newline == '\n')
9442 return smart_chomp(str, e, p);
9444 else {
9445 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9446 return smart_chomp(str, e, p);
/* rb_enc_check() yields the encoding used for the boundary test below
 * (NOTE(review): presumably it raises for incompatible encodings --
 * confirm).  A broken separator never matches. */
9450 enc = rb_enc_check(str, rs);
9451 if (is_broken_string(rs)) {
9452 return len;
/* Cheap last-byte comparison first, then the full byte comparison, and
 * finally make sure the match starts on a character head. */
9454 pp = e - rslen;
9455 if (p[len-1] == newline &&
9456 (rslen <= 1 ||
9457 memcmp(rsptr, pp, rslen) == 0)) {
9458 if (rb_enc_left_char_head(p, pp, e, enc) == pp)
9459 return len - rslen;
9460 RB_GC_GUARD(rs);
9462 return len;
9466 * Returns the separator for arguments of rb_str_chomp.
9468 * @return returns rb_rs ($/) as default, the default value of rb_rs ($/) is "\n".
9470 static VALUE
9471 chomp_rs(int argc, const VALUE *argv)
9473 rb_check_arity(argc, 0, 1);
9474 if (argc > 0) {
9475 VALUE rs = argv[0];
9476 if (!NIL_P(rs)) StringValue(rs);
9477 return rs;
9479 else {
9480 return rb_rs;
9484 VALUE
9485 rb_str_chomp_string(VALUE str, VALUE rs)
9487 long olen = RSTRING_LEN(str);
9488 long len = chompped_length(str, rs);
9489 if (len >= olen) return Qnil;
9490 str_modify_keep_cr(str);
9491 STR_SET_LEN(str, len);
9492 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9493 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9494 ENC_CODERANGE_CLEAR(str);
9496 return str;
9500 * call-seq:
9501 * str.chomp!(separator=$/) -> str or nil
9503 * Modifies <i>str</i> in place as described for String#chomp,
9504 * returning <i>str</i>, or <code>nil</code> if no modifications were
9505 * made.
9508 static VALUE
9509 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9511 VALUE rs;
9512 str_modifiable(str);
9513 if (RSTRING_LEN(str) == 0) return Qnil;
9514 rs = chomp_rs(argc, argv);
9515 if (NIL_P(rs)) return Qnil;
9516 return rb_str_chomp_string(str, rs);
9521 * call-seq:
9522 * str.chomp(separator=$/) -> new_str
9524 * Returns a new String with the given record separator removed
9525 * from the end of <i>str</i> (if present). If <code>$/</code> has not been
9526 * changed from the default Ruby record separator, then <code>chomp</code> also
9527 * removes carriage return characters (that is, it will remove <code>\n</code>,
9528 * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9529 * it will remove all trailing newlines from the string.
9531 * "hello".chomp #=> "hello"
9532 * "hello\n".chomp #=> "hello"
9533 * "hello\r\n".chomp #=> "hello"
9534 * "hello\n\r".chomp #=> "hello\n"
9535 * "hello\r".chomp #=> "hello"
9536 * "hello \n there".chomp #=> "hello \n there"
9537 * "hello".chomp("llo") #=> "he"
9538 * "hello\r\n\r\n".chomp('') #=> "hello"
9539 * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
9542 static VALUE
9543 rb_str_chomp(int argc, VALUE *argv, VALUE str)
9545 VALUE rs = chomp_rs(argc, argv);
9546 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9547 return rb_str_subseq(str, 0, chompped_length(str, rs));
9550 static long
9551 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9553 const char *const start = s;
9555 if (!s || s >= e) return 0;
9557 /* remove spaces at head */
9558 if (single_byte_optimizable(str)) {
9559 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9561 else {
9562 while (s < e) {
9563 int n;
9564 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9566 if (cc && !rb_isspace(cc)) break;
9567 s += n;
9570 return s - start;
9574 * call-seq:
9575 * str.lstrip! -> self or nil
9577 * Removes leading whitespace from the receiver.
9578 * Returns the altered receiver, or +nil+ if no change was made.
9579 * See also String#rstrip! and String#strip!.
9581 * Refer to String#strip for the definition of whitespace.
9583 * " hello ".lstrip! #=> "hello "
9584 * "hello ".lstrip! #=> nil
9585 * "hello".lstrip! #=> nil
9588 static VALUE
9589 rb_str_lstrip_bang(VALUE str)
9591 rb_encoding *enc;
9592 char *start, *s;
9593 long olen, loffset;
9595 str_modify_keep_cr(str);
9596 enc = STR_ENC_GET(str);
9597 RSTRING_GETMEM(str, start, olen);
9598 loffset = lstrip_offset(str, start, start+olen, enc);
9599 if (loffset > 0) {
9600 long len = olen-loffset;
9601 s = start + loffset;
9602 memmove(start, s, len);
9603 STR_SET_LEN(str, len);
9604 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9605 return str;
9607 return Qnil;
9612 * call-seq:
9613 * str.lstrip -> new_str
9615 * Returns a copy of the receiver with leading whitespace removed.
9616 * See also String#rstrip and String#strip.
9618 * Refer to String#strip for the definition of whitespace.
9620 * " hello ".lstrip #=> "hello "
9621 * "hello".lstrip #=> "hello"
9624 static VALUE
9625 rb_str_lstrip(VALUE str)
9627 char *start;
9628 long len, loffset;
9629 RSTRING_GETMEM(str, start, len);
9630 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9631 if (loffset <= 0) return str_duplicate(rb_cString, str);
9632 return rb_str_subseq(str, loffset, len - loffset);
9635 static long
9636 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9638 const char *t;
9640 rb_str_check_dummy_enc(enc);
9641 if (!s || s >= e) return 0;
9642 t = e;
9644 /* remove trailing spaces or '\0's */
9645 if (single_byte_optimizable(str)) {
9646 unsigned char c;
9647 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9649 else {
9650 char *tp;
9652 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9653 unsigned int c = rb_enc_codepoint(tp, e, enc);
9654 if (c && !rb_isspace(c)) break;
9655 t = tp;
9658 return e - t;
9662 * call-seq:
9663 * str.rstrip! -> self or nil
9665 * Removes trailing whitespace from the receiver.
9666 * Returns the altered receiver, or +nil+ if no change was made.
9667 * See also String#lstrip! and String#strip!.
9669 * Refer to String#strip for the definition of whitespace.
9671 * " hello ".rstrip! #=> " hello"
9672 * " hello".rstrip! #=> nil
9673 * "hello".rstrip! #=> nil
9676 static VALUE
9677 rb_str_rstrip_bang(VALUE str)
9679 rb_encoding *enc;
9680 char *start;
9681 long olen, roffset;
9683 str_modify_keep_cr(str);
9684 enc = STR_ENC_GET(str);
9685 RSTRING_GETMEM(str, start, olen);
9686 roffset = rstrip_offset(str, start, start+olen, enc);
9687 if (roffset > 0) {
9688 long len = olen - roffset;
9690 STR_SET_LEN(str, len);
9691 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9692 return str;
9694 return Qnil;
9699 * call-seq:
9700 * str.rstrip -> new_str
9702 * Returns a copy of the receiver with trailing whitespace removed.
9703 * See also String#lstrip and String#strip.
9705 * Refer to String#strip for the definition of whitespace.
9707 * " hello ".rstrip #=> " hello"
9708 * "hello".rstrip #=> "hello"
9711 static VALUE
9712 rb_str_rstrip(VALUE str)
9714 rb_encoding *enc;
9715 char *start;
9716 long olen, roffset;
9718 enc = STR_ENC_GET(str);
9719 RSTRING_GETMEM(str, start, olen);
9720 roffset = rstrip_offset(str, start, start+olen, enc);
9722 if (roffset <= 0) return str_duplicate(rb_cString, str);
9723 return rb_str_subseq(str, 0, olen-roffset);
9728 * call-seq:
9729 * str.strip! -> self or nil
9731 * Removes leading and trailing whitespace from the receiver.
9732 * Returns the altered receiver, or +nil+ if there was no change.
9734 * Refer to String#strip for the definition of whitespace.
9736 * " hello ".strip! #=> "hello"
9737 * "hello".strip! #=> nil
9740 static VALUE
9741 rb_str_strip_bang(VALUE str)
9743 char *start;
9744 long olen, loffset, roffset;
9745 rb_encoding *enc;
9747 str_modify_keep_cr(str);
9748 enc = STR_ENC_GET(str);
9749 RSTRING_GETMEM(str, start, olen);
9750 loffset = lstrip_offset(str, start, start+olen, enc);
9751 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9753 if (loffset > 0 || roffset > 0) {
9754 long len = olen-roffset;
9755 if (loffset > 0) {
9756 len -= loffset;
9757 memmove(start, start + loffset, len);
9759 STR_SET_LEN(str, len);
9760 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9761 return str;
9763 return Qnil;
9768 * call-seq:
9769 * str.strip -> new_str
9771 * Returns a copy of the receiver with leading and trailing whitespace removed.
9773 * Whitespace is defined as any of the following characters:
9774 * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9776 * " hello ".strip #=> "hello"
9777 * "\tgoodbye\r\n".strip #=> "goodbye"
9778 * "\x00\t\n\v\f\r ".strip #=> ""
9779 * "hello".strip #=> "hello"
9782 static VALUE
9783 rb_str_strip(VALUE str)
9785 char *start;
9786 long olen, loffset, roffset;
9787 rb_encoding *enc = STR_ENC_GET(str);
9789 RSTRING_GETMEM(str, start, olen);
9790 loffset = lstrip_offset(str, start, start+olen, enc);
9791 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9793 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9794 return rb_str_subseq(str, loffset, olen-loffset-roffset);
/*
 * Performs one search step for rb_str_scan.
 *
 * Searches +str+ for +pat+ beginning at *start.  On a hit, *start is
 * advanced past the match (by at least one character for an empty
 * match) and the result is returned: the matched substring when +pat+
 * is a String or the regexp has no groups, otherwise an array with one
 * entry per group (nil for groups that did not take part).
 * Returns Qnil when nothing is found.  +set_backref_str+ is passed on
 * to rb_pat_search() unchanged.
 */
9797 static VALUE
9798 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9800 VALUE result, match;
9801 struct re_registers *regs;
9802 int i;
9803 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9804 if (pos >= 0) {
/* A String pattern has no registers; its match ends pat-length bytes
 * after pos. */
9805 if (BUILTIN_TYPE(pat) == T_STRING) {
9806 regs = NULL;
9807 end = pos + RSTRING_LEN(pat);
9809 else {
/* Otherwise the match bounds are read from the current backref; the
 * BEG/END macros read from the local +regs+. */
9810 match = rb_backref_get();
9811 regs = RMATCH_REGS(match);
9812 pos = BEG(0);
9813 end = END(0);
9815 if (pos == end) {
9816 rb_encoding *enc = STR_ENC_GET(str);
9818 * Always consume at least one character of the input string
9820 if (RSTRING_LEN(str) > end)
9821 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9822 RSTRING_END(str), enc);
9823 else
9824 *start = end + 1;
9826 else {
9827 *start = end;
/* No groups: the result is the whole match. */
9829 if (!regs || regs->num_regs == 1) {
9830 result = rb_str_subseq(str, pos, end - pos);
9831 return result;
/* Groups present: collect groups 1..num_regs-1. */
9833 result = rb_ary_new2(regs->num_regs);
9834 for (i=1; i < regs->num_regs; i++) {
9835 VALUE s = Qnil;
9836 if (BEG(i) >= 0) {
9837 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9839 rb_ary_push(result, s);
9842 return result;
9844 return Qnil;
9849 * call-seq:
9850 * str.scan(pattern) -> array
9851 * str.scan(pattern) {|match, ...| block } -> str
9853 * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9854 * Regexp or a String). For each match, a result is
9855 * generated and either added to the result array or passed to the block. If
9856 * the pattern contains no groups, each individual result consists of the
9857 * matched string, <code>$&</code>. If the pattern contains groups, each
9858 * individual result is itself an array containing one entry per group.
9860 * a = "cruel world"
9861 * a.scan(/\w+/) #=> ["cruel", "world"]
9862 * a.scan(/.../) #=> ["cru", "el ", "wor"]
9863 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9864 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9866 * And the block form:
9868 * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9869 * print "\n"
9870 * a.scan(/(.)(.)/) {|x,y| print y, x }
9871 * print "\n"
9873 * <em>produces:</em>
9875 * <<cruel>> <<world>>
9876 * rceu lowlr
9879 static VALUE
9880 rb_str_scan(VALUE str, VALUE pat)
9882 VALUE result;
9883 long start = 0;
9884 long last = -1, prev = 0;
9885 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9887 pat = get_pat_quoted(pat, 1);
9888 mustnot_broken(str);
9889 if (!rb_block_given_p()) {
9890 VALUE ary = rb_ary_new();
9892 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9893 last = prev;
9894 prev = start;
9895 rb_ary_push(ary, result);
9897 if (last >= 0) rb_pat_search(pat, str, last, 1);
9898 else rb_backref_set(Qnil);
9899 return ary;
9902 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9903 last = prev;
9904 prev = start;
9905 rb_yield(result);
9906 str_mod_check(str, p, len);
9908 if (last >= 0) rb_pat_search(pat, str, last, 1);
9909 return str;
9914 * call-seq:
9915 * str.hex -> integer
9917 * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9918 * (with an optional sign and an optional <code>0x</code>) and returns the
9919 * corresponding number. Zero is returned on error.
9921 * "0x0a".hex #=> 10
9922 * "-1234".hex #=> -4660
9923 * "0".hex #=> 0
9924 * "wombat".hex #=> 0
9927 static VALUE
9928 rb_str_hex(VALUE str)
9930 return rb_str_to_inum(str, 16, FALSE);
9935 * call-seq:
9936 * str.oct -> integer
9938 * Treats leading characters of <i>str</i> as a string of octal digits (with an
9939 * optional sign) and returns the corresponding number. Returns 0 if the
9940 * conversion fails.
9942 * "123".oct #=> 83
9943 * "-377".oct #=> -255
9944 * "bad".oct #=> 0
9945 * "0377bad".oct #=> 255
9947 * If +str+ starts with <code>0</code>, radix indicators are honored.
9948 * See Kernel#Integer.
9951 static VALUE
9952 rb_str_oct(VALUE str)
9954 return rb_str_to_inum(str, -8, FALSE);
/*
 * Fallback used only when HAVE_CRYPT_R is not defined: a native lock
 * that rb_str_crypt holds around its call to crypt().
 */
9957 #ifndef HAVE_CRYPT_R
9958 # include "ruby/thread_native.h"
9959 # include "ruby/atomic.h"
/* initialized: 0 = not set up, 2 = set-up in progress, 1 = ready. */
9961 static struct {
9962 rb_atomic_t initialized;
9963 rb_nativethread_lock_t lock;
9964 } crypt_mutex;
/* atexit handler: destroys the lock and marks it as not set up. */
9966 static void
9967 crypt_mutex_destroy(void)
9969 RUBY_ASSERT_ALWAYS(crypt_mutex.initialized == 1);
9970 rb_nativethread_lock_destroy(&crypt_mutex.lock);
9971 crypt_mutex.initialized = 0;
/*
 * Sets up crypt_mutex.lock on first use.
 * NOTE(review): this reads as if RUBY_ATOMIC_CAS returns the previous
 * value -- confirm.  Under that reading the loop spins while another
 * caller is mid-setup (state 2); the caller that saw 0 initializes the
 * lock, registers the atexit handler and publishes state 1.
 */
9974 static void
9975 crypt_mutex_initialize(void)
9977 rb_atomic_t i;
9978 while ((i = RUBY_ATOMIC_CAS(crypt_mutex.initialized, 0, 2)) == 2);
9979 switch (i) {
9980 case 0:
9981 rb_nativethread_lock_initialize(&crypt_mutex.lock);
9982 atexit(crypt_mutex_destroy);
9983 RUBY_ASSERT(crypt_mutex.initialized == 2);
9984 RUBY_ATOMIC_CAS(crypt_mutex.initialized, 2, 1);
9985 break;
/* Already set up: nothing to do. */
9986 case 1:
9987 break;
9988 default:
9989 rb_bug("crypt_mutex.initialized: %d->%d", i, crypt_mutex.initialized);
9992 #endif
9995 * call-seq:
9996 * str.crypt(salt_str) -> new_str
9998 * Returns the string generated by calling <code>crypt(3)</code>
9999 * standard library function with <code>str</code> and
10000 * <code>salt_str</code>, in this order, as its arguments. Please do
10001 * not use this method any longer. It is legacy; provided only for
10002 * backward compatibility with ruby scripts in earlier days. It is
10003 * bad to use in contemporary programs for several reasons:
10005 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10006 * run on. The generated string lacks data portability.
10008 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10009 * (i.e. silently ends up in unexpected results).
10011 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10012 * thread safe.
10014 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10015 * very very weak. According to its manpage, Linux's traditional
10016 * <code>crypt(3)</code> output has only 2**56 variations; too
10017 * easy to brute force today. And this is the default behaviour.
10019 * * In order to make things robust some OSes implement so-called
10020 * "modular" usage. To go through, you have to do a complex
10021 * build-up of the <code>salt_str</code> parameter, by hand.
10022 * Failure in generation of a proper salt string tends not to
10023 * yield any errors; typos in parameters are normally not
10024 * detectable.
10026 * * For instance, in the following example, the second invocation
10027 * of String#crypt is wrong; it has a typo in "round=" (lacks
10028 * "s"). However the call does not fail and something unexpected
10029 * is generated.
10031 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10032 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10034 * * Even in the "modular" mode, some hash functions are considered
10035 * archaic and no longer recommended at all; for instance module
10036 * <code>$1$</code> is officially abandoned by its author: see
10037 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10038 * instance module <code>$3$</code> is considered completely
10039 * broken: see the manpage of FreeBSD.
10041 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10042 * written above, <code>crypt(3)</code> on Mac OS never fails.
10043 * This means even if you build up a proper salt string it
10044 * generates a traditional DES hash anyways, and there is no way
10045 * for you to be aware of.
10047 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10049 * If for some reason you cannot migrate to other secure contemporary
10050 * password hashing algorithms, install the string-crypt gem and
10051 * <code>require 'string/crypt'</code> to continue using it.
10054 static VALUE
10055 rb_str_crypt(VALUE str, VALUE salt)
/* CRYPT_END() undoes whichever set-up the chosen variant needs: it
 * releases the crypt_data buffer (crypt_r) or unlocks the mutex (crypt). */
10057 #ifdef HAVE_CRYPT_R
10058 VALUE databuf;
10059 struct crypt_data *data;
10060 # define CRYPT_END() ALLOCV_END(databuf)
10061 #else
10062 extern char *crypt(const char *, const char *);
10063 # define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10064 #endif
10065 VALUE result;
10066 const char *s, *saltp;
10067 char *res;
10068 #ifdef BROKEN_CRYPT
10069 char salt_8bit_clean[3];
10070 #endif
/* Validate the arguments: salt must be a String with at least two
 * non-NUL leading bytes. */
10072 StringValue(salt);
10073 mustnot_wchar(str);
10074 mustnot_wchar(salt);
10075 s = StringValueCStr(str);
10076 saltp = RSTRING_PTR(salt);
10077 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10078 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
/* With BROKEN_CRYPT, a salt whose first two bytes are not both ASCII is
 * replaced by a local copy of those bytes with the high bit cleared. */
10081 #ifdef BROKEN_CRYPT
10082 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10083 salt_8bit_clean[0] = saltp[0] & 0x7f;
10084 salt_8bit_clean[1] = saltp[1] & 0x7f;
10085 salt_8bit_clean[2] = '\0';
10086 saltp = salt_8bit_clean;
10088 #endif
/* Either call crypt_r() with a freshly allocated crypt_data, or call
 * crypt() while holding crypt_mutex. */
10089 #ifdef HAVE_CRYPT_R
10090 data = ALLOCV(databuf, sizeof(struct crypt_data));
10091 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10092 data->initialized = 0;
10093 # endif
10094 res = crypt_r(s, saltp, data);
10095 #else
10096 crypt_mutex_initialize();
10097 rb_nativethread_lock_lock(&crypt_mutex.lock);
10098 res = crypt(s, saltp);
10099 #endif
/* errno is saved before CRYPT_END() runs and is then passed to
 * rb_syserr_fail(). */
10100 if (!res) {
10101 int err = errno;
10102 CRYPT_END();
10103 rb_syserr_fail(err, "crypt");
/* The result string is created from res before CRYPT_END() is called. */
10105 result = rb_str_new_cstr(res);
10106 CRYPT_END();
10107 return result;
10112 * call-seq:
10113 * str.ord -> integer
10115 * Returns the Integer ordinal of a one-character string.
10117 * "a".ord #=> 97
10120 static VALUE
10121 rb_str_ord(VALUE s)
10123 unsigned int c;
10125 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10126 return UINT2NUM(c);
10129 * call-seq:
10130 * str.sum(n=16) -> integer
10132 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
10133 * where <em>n</em> is the optional Integer parameter, defaulting
10134 * to 16. The result is simply the sum of the binary value of each byte in
10135 * <i>str</i>, masked with <code>2**n - 1</code> (that is, taken modulo <code>2**n</code>). This is not a particularly good
10136 * checksum.
10139 static VALUE
10140 rb_str_sum(int argc, VALUE *argv, VALUE str)
/* sum0 accumulates bytes in a native unsigned long; +sum+ is the Ruby
 * Integer that receives sum0 whenever sum0 gets close to FIXNUM_MAX. */
10142 int bits = 16;
10143 char *ptr, *p, *pend;
10144 long len;
10145 VALUE sum = INT2FIX(0);
10146 unsigned long sum0 = 0;
/* Optional argument: the bit width.  A negative width is treated as 0,
 * and 0 means the sum is not masked at all (see below). */
10148 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10149 bits = 0;
10151 ptr = p = RSTRING_PTR(str);
10152 len = RSTRING_LEN(str);
10153 pend = p + len;
10155 while (p < pend) {
/* Flush sum0 into +sum+ before one more byte could push it past
 * FIXNUM_MAX.  str_mod_check() follows the '+' call using the pointer
 * and length captured above. */
10156 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10157 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10158 str_mod_check(str, ptr, len);
10159 sum0 = 0;
10161 sum0 += (unsigned char)*p;
10162 p++;
/* bits == 0: return the full sum without masking. */
10165 if (bits == 0) {
10166 if (sum0) {
10167 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10170 else {
/* +sum+ still 0: everything is in sum0, so mask natively.  The mask is
 * skipped when +bits+ is at least the width of long. */
10171 if (sum == INT2FIX(0)) {
10172 if (bits < (int)sizeof(long)*CHAR_BIT) {
10173 sum0 &= (((unsigned long)1)<<bits)-1;
10175 sum = LONG2FIX(sum0);
10177 else {
/* Otherwise finish the addition and compute sum & ((1 << bits) - 1)
 * through Integer method calls. */
10178 VALUE mod;
10180 if (sum0) {
10181 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10184 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10185 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10186 sum = rb_funcall(sum, '&', 1, mod);
10189 return sum;
10192 static VALUE
10193 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10195 rb_encoding *enc;
10196 VALUE w;
10197 long width, len, flen = 1, fclen = 1;
10198 VALUE res;
10199 char *p;
10200 const char *f = " ";
10201 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10202 VALUE pad;
10203 int singlebyte = 1, cr;
10204 int termlen;
10206 rb_scan_args(argc, argv, "11", &w, &pad);
10207 enc = STR_ENC_GET(str);
10208 termlen = rb_enc_mbminlen(enc);
10209 width = NUM2LONG(w);
10210 if (argc == 2) {
10211 StringValue(pad);
10212 enc = rb_enc_check(str, pad);
10213 f = RSTRING_PTR(pad);
10214 flen = RSTRING_LEN(pad);
10215 fclen = str_strlen(pad, enc); /* rb_enc_check */
10216 singlebyte = single_byte_optimizable(pad);
10217 if (flen == 0 || fclen == 0) {
10218 rb_raise(rb_eArgError, "zero width padding");
10221 len = str_strlen(str, enc); /* rb_enc_check */
10222 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10223 n = width - len;
10224 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10225 rlen = n - llen;
10226 cr = ENC_CODERANGE(str);
10227 if (flen > 1) {
10228 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10229 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10231 size = RSTRING_LEN(str);
10232 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10233 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10234 (len += llen2 + rlen2) >= LONG_MAX - size) {
10235 rb_raise(rb_eArgError, "argument too big");
10237 len += size;
10238 res = str_new0(rb_cString, 0, len, termlen);
10239 p = RSTRING_PTR(res);
10240 if (flen <= 1) {
10241 memset(p, *f, llen);
10242 p += llen;
10244 else {
10245 while (llen >= fclen) {
10246 memcpy(p,f,flen);
10247 p += flen;
10248 llen -= fclen;
10250 if (llen > 0) {
10251 memcpy(p, f, llen2);
10252 p += llen2;
10255 memcpy(p, RSTRING_PTR(str), size);
10256 p += size;
10257 if (flen <= 1) {
10258 memset(p, *f, rlen);
10259 p += rlen;
10261 else {
10262 while (rlen >= fclen) {
10263 memcpy(p,f,flen);
10264 p += flen;
10265 rlen -= fclen;
10267 if (rlen > 0) {
10268 memcpy(p, f, rlen2);
10269 p += rlen2;
10272 TERM_FILL(p, termlen);
10273 STR_SET_LEN(res, p-RSTRING_PTR(res));
10274 rb_enc_associate(res, enc);
10275 if (argc == 2)
10276 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10277 if (cr != ENC_CODERANGE_BROKEN)
10278 ENC_CODERANGE_SET(res, cr);
10280 RB_GC_GUARD(pad);
10281 return res;
10286 * call-seq:
10287 * str.ljust(integer, padstr=' ') -> new_str
10289 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10290 * String of length <i>integer</i> with <i>str</i> left justified
10291 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10293 * "hello".ljust(4) #=> "hello"
10294 * "hello".ljust(20) #=> "hello "
10295 * "hello".ljust(20, '1234') #=> "hello123412341234123"
10298 static VALUE
10299 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10301 return rb_str_justify(argc, argv, str, 'l');
10306 * call-seq:
10307 * str.rjust(integer, padstr=' ') -> new_str
10309 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10310 * String of length <i>integer</i> with <i>str</i> right justified
10311 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10313 * "hello".rjust(4) #=> "hello"
10314 * "hello".rjust(20) #=> " hello"
10315 * "hello".rjust(20, '1234') #=> "123412341234123hello"
10318 static VALUE
10319 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10321 return rb_str_justify(argc, argv, str, 'r');
10326 * call-seq:
10327 * str.center(width, padstr=' ') -> new_str
10329 * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
10330 * returns a new String of length +width+ with +str+ centered and padded with
10331 * +padstr+; otherwise, returns +str+.
10333 * "hello".center(4) #=> "hello"
10334 * "hello".center(20) #=> " hello "
10335 * "hello".center(20, '123') #=> "1231231hello12312312"
10338 static VALUE
10339 rb_str_center(int argc, VALUE *argv, VALUE str)
10341 return rb_str_justify(argc, argv, str, 'c');
10345 * call-seq:
10346 * str.partition(sep) -> [head, sep, tail]
10347 * str.partition(regexp) -> [head, match, tail]
10349 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10350 * and returns the part before it, the match, and the part
10351 * after it.
10352 * If it is not found, returns <i>str</i> and two empty strings.
10354 * "hello".partition("l") #=> ["he", "l", "lo"]
10355 * "hello".partition("x") #=> ["hello", "", ""]
10356 * "hello".partition(/.l/) #=> ["h", "el", "lo"]
10359 static VALUE
10360 rb_str_partition(VALUE str, VALUE sep)
10362 long pos;
10364 sep = get_pat_quoted(sep, 0);
10365 if (RB_TYPE_P(sep, T_REGEXP)) {
10366 if (rb_reg_search(sep, str, 0, 0) < 0) {
10367 goto failed;
10369 VALUE match = rb_backref_get();
10370 struct re_registers *regs = RMATCH_REGS(match);
10372 pos = BEG(0);
10373 sep = rb_str_subseq(str, pos, END(0) - pos);
10375 else {
10376 pos = rb_str_index(str, sep, 0);
10377 if (pos < 0) goto failed;
10379 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10380 sep,
10381 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10382 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10384 failed:
10385 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10389 * call-seq:
10390 * str.rpartition(sep) -> [head, sep, tail]
10391 * str.rpartition(regexp) -> [head, match, tail]
10393 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10394 * of the string, and returns the part before it, the match, and the part
10395 * after it.
10396 * If it is not found, returns two empty strings and <i>str</i>.
10398 * "hello".rpartition("l") #=> ["hel", "l", "o"]
10399 * "hello".rpartition("x") #=> ["", "", "hello"]
10400 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
10402 * The match from the end means starting at the possible last position, not
10403 * the last of longest matches.
10405 * "hello".rpartition(/l+/) #=> ["hel", "l", "o"]
10407 * To partition at the last longest match, combine the pattern with a
10408 * negative lookbehind.
10410 * "hello".rpartition(/(?<!l)l+/) #=> ["he", "ll", "o"]
10412 * Or use String#partition with a negative lookahead.
10414 * "hello".partition(/l+(?!.*l)/) #=> ["he", "ll", "o"]
10417 static VALUE
10418 rb_str_rpartition(VALUE str, VALUE sep)
10420 long pos = RSTRING_LEN(str);
10422 sep = get_pat_quoted(sep, 0);
10423 if (RB_TYPE_P(sep, T_REGEXP)) {
10424 if (rb_reg_search(sep, str, pos, 1) < 0) {
10425 goto failed;
10427 VALUE match = rb_backref_get();
10428 struct re_registers *regs = RMATCH_REGS(match);
10430 pos = BEG(0);
10431 sep = rb_str_subseq(str, pos, END(0) - pos);
10433 else {
10434 pos = rb_str_sublen(str, pos);
10435 pos = rb_str_rindex(str, sep, pos);
10436 if (pos < 0) {
10437 goto failed;
10439 pos = rb_str_offset(str, pos);
10442 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10443 sep,
10444 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10445 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10446 failed:
10447 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10451 * call-seq:
10452 * str.start_with?([prefixes]+) -> true or false
10454 * Returns true if +str+ starts with one of the +prefixes+ given.
10455 * Each of the +prefixes+ should be a String or a Regexp.
10457 * "hello".start_with?("hell") #=> true
10458 * "hello".start_with?(/H/i) #=> true
10460 * # returns true if one of the prefixes matches.
10461 * "hello".start_with?("heaven", "hell") #=> true
10462 * "hello".start_with?("heaven", "paradise") #=> false
10465 static VALUE
10466 rb_str_start_with(int argc, VALUE *argv, VALUE str)
10468 int i;
10470 for (i=0; i<argc; i++) {
10471 VALUE tmp = argv[i];
10472 if (RB_TYPE_P(tmp, T_REGEXP)) {
10473 if (rb_reg_start_with_p(tmp, str))
10474 return Qtrue;
10476 else {
10477 StringValue(tmp);
10478 rb_enc_check(str, tmp);
10479 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10480 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10481 return Qtrue;
10484 return Qfalse;
10488 * call-seq:
10489 * str.end_with?([suffixes]+) -> true or false
10491 * Returns true if +str+ ends with one of the +suffixes+ given.
10493 * "hello".end_with?("ello") #=> true
10495 * # returns true if one of the +suffixes+ matches.
10496 * "hello".end_with?("heaven", "ello") #=> true
10497 * "hello".end_with?("heaven", "paradise") #=> false
10500 static VALUE
10501 rb_str_end_with(int argc, VALUE *argv, VALUE str)
10503 int i;
10504 char *p, *s, *e;
10505 rb_encoding *enc;
10507 for (i=0; i<argc; i++) {
10508 VALUE tmp = argv[i];
10509 long slen, tlen;
10510 StringValue(tmp);
10511 enc = rb_enc_check(str, tmp);
10512 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10513 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10514 p = RSTRING_PTR(str);
10515 e = p + slen;
10516 s = e - tlen;
10517 if (rb_enc_left_char_head(p, s, e, enc) != s)
10518 continue;
10519 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10520 return Qtrue;
10522 return Qfalse;
10526 * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
10527 * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
10529 * @param str the target
10530 * @param prefix the prefix
10531 * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
10532 * @retval Positive-Integer otherwise
10534 static long
10535 deleted_prefix_length(VALUE str, VALUE prefix)
10537 char *strptr, *prefixptr;
10538 long olen, prefixlen;
10540 StringValue(prefix);
10541 if (is_broken_string(prefix)) return 0;
10542 rb_enc_check(str, prefix);
10544 /* return 0 if not start with prefix */
10545 prefixlen = RSTRING_LEN(prefix);
10546 if (prefixlen <= 0) return 0;
10547 olen = RSTRING_LEN(str);
10548 if (olen < prefixlen) return 0;
10549 strptr = RSTRING_PTR(str);
10550 prefixptr = RSTRING_PTR(prefix);
10551 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10553 return prefixlen;
10557 * call-seq:
10558 * str.delete_prefix!(prefix) -> self or nil
10560 * Deletes leading <code>prefix</code> from <i>str</i>, returning
10561 * <code>nil</code> if no change was made.
10563 * "hello".delete_prefix!("hel") #=> "lo"
10564 * "hello".delete_prefix!("llo") #=> nil
10567 static VALUE
10568 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10570 long prefixlen;
10571 str_modify_keep_cr(str);
10573 prefixlen = deleted_prefix_length(str, prefix);
10574 if (prefixlen <= 0) return Qnil;
10576 return rb_str_drop_bytes(str, prefixlen);
10580 * call-seq:
10581 * str.delete_prefix(prefix) -> new_str
10583 * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10585 * "hello".delete_prefix("hel") #=> "lo"
10586 * "hello".delete_prefix("llo") #=> "hello"
10589 static VALUE
10590 rb_str_delete_prefix(VALUE str, VALUE prefix)
10592 long prefixlen;
10594 prefixlen = deleted_prefix_length(str, prefix);
10595 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10597 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10601 * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
10602 * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
10604 * @param str the target
10605 * @param suffix the suffix
10606 * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
10607 * @retval Positive-Integer otherwise
10609 static long
10610 deleted_suffix_length(VALUE str, VALUE suffix)
10612 char *strptr, *suffixptr, *s;
10613 long olen, suffixlen;
10614 rb_encoding *enc;
10616 StringValue(suffix);
10617 if (is_broken_string(suffix)) return 0;
10618 enc = rb_enc_check(str, suffix);
10620 /* return 0 if not start with suffix */
10621 suffixlen = RSTRING_LEN(suffix);
10622 if (suffixlen <= 0) return 0;
10623 olen = RSTRING_LEN(str);
10624 if (olen < suffixlen) return 0;
10625 strptr = RSTRING_PTR(str);
10626 suffixptr = RSTRING_PTR(suffix);
10627 s = strptr + olen - suffixlen;
10628 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10629 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10631 return suffixlen;
10635 * call-seq:
10636 * str.delete_suffix!(suffix) -> self or nil
10638 * Deletes trailing <code>suffix</code> from <i>str</i>, returning
10639 * <code>nil</code> if no change was made.
10641 * "hello".delete_suffix!("llo") #=> "he"
10642 * "hello".delete_suffix!("hel") #=> nil
10645 static VALUE
10646 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10648 long olen, suffixlen, len;
10649 str_modifiable(str);
10651 suffixlen = deleted_suffix_length(str, suffix);
10652 if (suffixlen <= 0) return Qnil;
10654 olen = RSTRING_LEN(str);
10655 str_modify_keep_cr(str);
10656 len = olen - suffixlen;
10657 STR_SET_LEN(str, len);
10658 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10659 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10660 ENC_CODERANGE_CLEAR(str);
10662 return str;
10666 * call-seq:
10667 * str.delete_suffix(suffix) -> new_str
10669 * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10671 * "hello".delete_suffix("llo") #=> "he"
10672 * "hello".delete_suffix("hel") #=> "hello"
10675 static VALUE
10676 rb_str_delete_suffix(VALUE str, VALUE suffix)
10678 long suffixlen;
10680 suffixlen = deleted_suffix_length(str, suffix);
10681 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10683 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10686 void
10687 rb_str_setter(VALUE val, ID id, VALUE *var)
10689 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10690 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10692 *var = val;
10695 static void
10696 rb_fs_setter(VALUE val, ID id, VALUE *var)
10698 val = rb_fs_check(val);
10699 if (!val) {
10700 rb_raise(rb_eTypeError,
10701 "value of %"PRIsVALUE" must be String or Regexp",
10702 rb_id2str(id));
10704 if (!NIL_P(val)) {
10705 rb_warn_deprecated("`$;'", NULL);
10707 *var = val;
10712 * call-seq:
10713 * str.force_encoding(encoding) -> str
10715 * Changes the encoding to +encoding+ and returns self.
10718 static VALUE
10719 rb_str_force_encoding(VALUE str, VALUE enc)
10721 str_modifiable(str);
10722 rb_enc_associate(str, rb_to_encoding(enc));
10723 ENC_CODERANGE_CLEAR(str);
10724 return str;
10728 * call-seq:
10729 * str.b -> str
10731 * Returns a copied string whose encoding is ASCII-8BIT.
10734 static VALUE
10735 rb_str_b(VALUE str)
10737 VALUE str2;
10738 if (FL_TEST(str, STR_NOEMBED)) {
10739 str2 = str_alloc_heap(rb_cString);
10741 else {
10742 str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10744 str_replace_shared_without_enc(str2, str);
10745 ENC_CODERANGE_CLEAR(str2);
10746 return str2;
10750 * call-seq:
10751 * str.valid_encoding? -> true or false
10753 * Returns true for a string which is encoded correctly.
10755 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10756 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10757 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
10760 static VALUE
10761 rb_str_valid_encoding_p(VALUE str)
10763 int cr = rb_enc_str_coderange(str);
10765 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10769 * call-seq:
10770 * str.ascii_only? -> true or false
10772 * Returns true for a string which has only ASCII characters.
10774 * "abc".force_encoding("UTF-8").ascii_only? #=> true
10775 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
10778 static VALUE
10779 rb_str_is_ascii_only_p(VALUE str)
10781 int cr = rb_enc_str_coderange(str);
10783 return RBOOL(cr == ENC_CODERANGE_7BIT);
10786 VALUE
10787 rb_str_ellipsize(VALUE str, long len)
10789 static const char ellipsis[] = "...";
10790 const long ellipsislen = sizeof(ellipsis) - 1;
10791 rb_encoding *const enc = rb_enc_get(str);
10792 const long blen = RSTRING_LEN(str);
10793 const char *const p = RSTRING_PTR(str), *e = p + blen;
10794 VALUE estr, ret = 0;
10796 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10797 if (len * rb_enc_mbminlen(enc) >= blen ||
10798 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10799 ret = str;
10801 else if (len <= ellipsislen ||
10802 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10803 if (rb_enc_asciicompat(enc)) {
10804 ret = rb_str_new(ellipsis, len);
10805 rb_enc_associate(ret, enc);
10807 else {
10808 estr = rb_usascii_str_new(ellipsis, len);
10809 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10812 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10813 rb_str_cat(ret, ellipsis, ellipsislen);
10815 else {
10816 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10817 rb_enc_from_encoding(enc), 0, Qnil);
10818 rb_str_append(ret, estr);
10820 return ret;
10823 static VALUE
10824 str_compat_and_valid(VALUE str, rb_encoding *enc)
10826 int cr;
10827 str = StringValue(str);
10828 cr = rb_enc_str_coderange(str);
10829 if (cr == ENC_CODERANGE_BROKEN) {
10830 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10832 else {
10833 rb_encoding *e = STR_ENC_GET(str);
10834 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10835 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10836 rb_enc_name(enc), rb_enc_name(e));
10839 return str;
10842 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10844 VALUE
10845 rb_str_scrub(VALUE str, VALUE repl)
10847 rb_encoding *enc = STR_ENC_GET(str);
10848 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10851 VALUE
10852 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10854 int cr = ENC_CODERANGE_UNKNOWN;
10855 if (enc == STR_ENC_GET(str)) {
10856 /* cached coderange makes sense only when enc equals the
10857 * actual encoding of str */
10858 cr = ENC_CODERANGE(str);
10860 return enc_str_scrub(enc, str, repl, cr);
10863 static VALUE
10864 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10866 int encidx;
10867 VALUE buf = Qnil;
10868 const char *rep, *p, *e, *p1, *sp;
10869 long replen = -1;
10870 long slen;
10872 if (rb_block_given_p()) {
10873 if (!NIL_P(repl))
10874 rb_raise(rb_eArgError, "both of block and replacement given");
10875 replen = 0;
10878 if (ENC_CODERANGE_CLEAN_P(cr))
10879 return Qnil;
10881 if (!NIL_P(repl)) {
10882 repl = str_compat_and_valid(repl, enc);
10885 if (rb_enc_dummy_p(enc)) {
10886 return Qnil;
10888 encidx = rb_enc_to_index(enc);
10890 #define DEFAULT_REPLACE_CHAR(str) do { \
10891 static const char replace[sizeof(str)-1] = str; \
10892 rep = replace; replen = (int)sizeof(replace); \
10893 } while (0)
10895 slen = RSTRING_LEN(str);
10896 p = RSTRING_PTR(str);
10897 e = RSTRING_END(str);
10898 p1 = p;
10899 sp = p;
10901 if (rb_enc_asciicompat(enc)) {
10902 int rep7bit_p;
10903 if (!replen) {
10904 rep = NULL;
10905 rep7bit_p = FALSE;
10907 else if (!NIL_P(repl)) {
10908 rep = RSTRING_PTR(repl);
10909 replen = RSTRING_LEN(repl);
10910 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10912 else if (encidx == rb_utf8_encindex()) {
10913 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10914 rep7bit_p = FALSE;
10916 else {
10917 DEFAULT_REPLACE_CHAR("?");
10918 rep7bit_p = TRUE;
10920 cr = ENC_CODERANGE_7BIT;
10922 p = search_nonascii(p, e);
10923 if (!p) {
10924 p = e;
10926 while (p < e) {
10927 int ret = rb_enc_precise_mbclen(p, e, enc);
10928 if (MBCLEN_NEEDMORE_P(ret)) {
10929 break;
10931 else if (MBCLEN_CHARFOUND_P(ret)) {
10932 cr = ENC_CODERANGE_VALID;
10933 p += MBCLEN_CHARFOUND_LEN(ret);
10935 else if (MBCLEN_INVALID_P(ret)) {
10937 * p1~p: valid ascii/multibyte chars
10938 * p ~e: invalid bytes + unknown bytes
10940 long clen = rb_enc_mbmaxlen(enc);
10941 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
10942 if (p > p1) {
10943 rb_str_buf_cat(buf, p1, p - p1);
10946 if (e - p < clen) clen = e - p;
10947 if (clen <= 2) {
10948 clen = 1;
10950 else {
10951 const char *q = p;
10952 clen--;
10953 for (; clen > 1; clen--) {
10954 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10955 if (MBCLEN_NEEDMORE_P(ret)) break;
10956 if (MBCLEN_INVALID_P(ret)) continue;
10957 UNREACHABLE;
10960 if (rep) {
10961 rb_str_buf_cat(buf, rep, replen);
10962 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10964 else {
10965 repl = rb_yield(rb_enc_str_new(p, clen, enc));
10966 str_mod_check(str, sp, slen);
10967 repl = str_compat_and_valid(repl, enc);
10968 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10969 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
10970 cr = ENC_CODERANGE_VALID;
10972 p += clen;
10973 p1 = p;
10974 p = search_nonascii(p, e);
10975 if (!p) {
10976 p = e;
10977 break;
10980 else {
10981 UNREACHABLE;
10984 if (NIL_P(buf)) {
10985 if (p == e) {
10986 ENC_CODERANGE_SET(str, cr);
10987 return Qnil;
10989 buf = rb_str_buf_new(RSTRING_LEN(str));
10991 if (p1 < p) {
10992 rb_str_buf_cat(buf, p1, p - p1);
10994 if (p < e) {
10995 if (rep) {
10996 rb_str_buf_cat(buf, rep, replen);
10997 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10999 else {
11000 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11001 str_mod_check(str, sp, slen);
11002 repl = str_compat_and_valid(repl, enc);
11003 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11004 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11005 cr = ENC_CODERANGE_VALID;
11009 else {
11010 /* ASCII incompatible */
11011 long mbminlen = rb_enc_mbminlen(enc);
11012 if (!replen) {
11013 rep = NULL;
11015 else if (!NIL_P(repl)) {
11016 rep = RSTRING_PTR(repl);
11017 replen = RSTRING_LEN(repl);
11019 else if (encidx == ENCINDEX_UTF_16BE) {
11020 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11022 else if (encidx == ENCINDEX_UTF_16LE) {
11023 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11025 else if (encidx == ENCINDEX_UTF_32BE) {
11026 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11028 else if (encidx == ENCINDEX_UTF_32LE) {
11029 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11031 else {
11032 DEFAULT_REPLACE_CHAR("?");
11035 while (p < e) {
11036 int ret = rb_enc_precise_mbclen(p, e, enc);
11037 if (MBCLEN_NEEDMORE_P(ret)) {
11038 break;
11040 else if (MBCLEN_CHARFOUND_P(ret)) {
11041 p += MBCLEN_CHARFOUND_LEN(ret);
11043 else if (MBCLEN_INVALID_P(ret)) {
11044 const char *q = p;
11045 long clen = rb_enc_mbmaxlen(enc);
11046 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11047 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11049 if (e - p < clen) clen = e - p;
11050 if (clen <= mbminlen * 2) {
11051 clen = mbminlen;
11053 else {
11054 clen -= mbminlen;
11055 for (; clen > mbminlen; clen-=mbminlen) {
11056 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11057 if (MBCLEN_NEEDMORE_P(ret)) break;
11058 if (MBCLEN_INVALID_P(ret)) continue;
11059 UNREACHABLE;
11062 if (rep) {
11063 rb_str_buf_cat(buf, rep, replen);
11065 else {
11066 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11067 str_mod_check(str, sp, slen);
11068 repl = str_compat_and_valid(repl, enc);
11069 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11071 p += clen;
11072 p1 = p;
11074 else {
11075 UNREACHABLE;
11078 if (NIL_P(buf)) {
11079 if (p == e) {
11080 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
11081 return Qnil;
11083 buf = rb_str_buf_new(RSTRING_LEN(str));
11085 if (p1 < p) {
11086 rb_str_buf_cat(buf, p1, p - p1);
11088 if (p < e) {
11089 if (rep) {
11090 rb_str_buf_cat(buf, rep, replen);
11092 else {
11093 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11094 str_mod_check(str, sp, slen);
11095 repl = str_compat_and_valid(repl, enc);
11096 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11099 cr = ENC_CODERANGE_VALID;
11101 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11102 return buf;
11106 * call-seq:
11107 * str.scrub -> new_str
11108 * str.scrub(repl) -> new_str
11109 * str.scrub{|bytes|} -> new_str
11111 * If the string contains an invalid byte sequence, replaces the invalid bytes
11112 * with the given replacement character; otherwise returns a copy of the string.
11113 * If a block is given, replaces invalid bytes with the value returned by the block.
11115 * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
11116 * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
11117 * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11119 static VALUE
11120 str_scrub(int argc, VALUE *argv, VALUE str)
11122 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11123 VALUE new = rb_str_scrub(str, repl);
11124 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11128 * call-seq:
11129 * str.scrub! -> str
11130 * str.scrub!(repl) -> str
11131 * str.scrub!{|bytes|} -> str
11133 * If the string contains an invalid byte sequence, replaces the invalid bytes
11134 * with the given replacement character in place; in either case returns self.
11135 * If a block is given, replaces invalid bytes with the value returned by the block.
11137 * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
11138 * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
11139 * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11141 static VALUE
11142 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11144 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11145 VALUE new = rb_str_scrub(str, repl);
11146 if (!NIL_P(new)) rb_str_replace(str, new);
11147 return str;
11150 static ID id_normalize;
11151 static ID id_normalized_p;
11152 static VALUE mUnicodeNormalize;
11154 static VALUE
11155 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11157 static int UnicodeNormalizeRequired = 0;
11158 VALUE argv2[2];
11160 if (!UnicodeNormalizeRequired) {
11161 rb_require("unicode_normalize/normalize.rb");
11162 UnicodeNormalizeRequired = 1;
11164 argv2[0] = str;
11165 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11166 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11170 * call-seq:
11171 * str.unicode_normalize(form=:nfc)
11173 * Unicode Normalization---Returns a normalized form of +str+,
11174 * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
11175 * The normalization form used is determined by +form+, which can
11176 * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11177 * The default is +:nfc+.
11179 * If the string is not in a Unicode Encoding, then an Exception is raised.
11180 * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
11181 * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
11182 * Anything other than UTF-8 is implemented by converting to UTF-8,
11183 * which makes it slower than UTF-8.
11185 * "a\u0300".unicode_normalize #=> "\u00E0"
11186 * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
11187 * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
11188 * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
11189 * #=> Encoding::CompatibilityError raised
11191 static VALUE
11192 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11194 return unicode_normalize_common(argc, argv, str, id_normalize);
11198 * call-seq:
11199 * str.unicode_normalize!(form=:nfc)
11201 * Destructive version of String#unicode_normalize, doing Unicode
11202 * normalization in place.
11204 static VALUE
11205 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11207 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11210 /* call-seq:
11211 * str.unicode_normalized?(form=:nfc)
11213 * Checks whether +str+ is in Unicode normalization form +form+,
11214 * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11215 * The default is +:nfc+.
11217 * If the string is not in a Unicode Encoding, then an Exception is raised.
11218 * For details, see String#unicode_normalize.
11220 * "a\u0300".unicode_normalized? #=> false
11221 * "a\u0300".unicode_normalized?(:nfd) #=> true
11222 * "\u00E0".unicode_normalized? #=> true
11223 * "\u00E0".unicode_normalized?(:nfd) #=> false
11224 * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
11225 * #=> Encoding::CompatibilityError raised
11227 static VALUE
11228 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11230 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11233 /**********************************************************************
11234 * Document-class: Symbol
11236 * Symbol objects represent named identifiers inside the Ruby interpreter.
11238 * You can create a \Symbol object explicitly with:
11240 * - A {symbol literal}[doc/syntax/literals_rdoc.html#label-Symbol+Literals].
11242 * The same Symbol object will be
11243 * created for a given name or string for the duration of a program's
11244 * execution, regardless of the context or meaning of that name. Thus
11245 * if <code>Fred</code> is a constant in one context, a method in
11246 * another, and a class in a third, the Symbol <code>:Fred</code>
11247 * will be the same object in all three contexts.
11249 * module One
11250 * class Fred
11251 * end
11252 * $f1 = :Fred
11253 * end
11254 * module Two
11255 * Fred = 1
11256 * $f2 = :Fred
11257 * end
11258 * def Fred()
11259 * end
11260 * $f3 = :Fred
11261 * $f1.object_id #=> 2514190
11262 * $f2.object_id #=> 2514190
11263 * $f3.object_id #=> 2514190
11265 * Constant, method, and variable names are returned as symbols:
11267 * module One
11268 * Two = 2
11269 * def three; 3 end
11270 * @four = 4
11271 * @@five = 5
11272 * $six = 6
11273 * end
11274 * seven = 7
11276 * One.constants
11277 * # => [:Two]
11278 * One.instance_methods(true)
11279 * # => [:three]
11280 * One.instance_variables
11281 * # => [:@four]
11282 * One.class_variables
11283 * # => [:@@five]
11284 * global_variables.grep(/six/)
11285 * # => [:$six]
11286 * local_variables
11287 * # => [:seven]
11289 * Symbol objects are different from String objects in that
11290 * Symbol objects represent identifiers, while String objects
11291 * represent text or data.
11293 * == What's Here
11295 * First, what's elsewhere. \Class \Symbol:
11297 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
11298 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
11300 * Here, class \Symbol provides methods that are useful for:
11302 * - {Querying}[#class-Symbol-label-Methods+for+Querying]
11303 * - {Comparing}[#class-Symbol-label-Methods+for+Comparing]
11304 * - {Converting}[#class-Symbol-label-Methods+for+Converting]
11306 * === Methods for Querying
11308 * - ::all_symbols:: Returns an array of the symbols currently in Ruby's symbol table.
11309 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring
11310 * in symbol that matches a given Regexp
11311 * or other object; returns +nil+ if no match is found.
11312 * - #[], #slice :: Returns a substring of symbol
11313 * determined by a given index, start/length, or range, or string.
11314 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11315 * - #encoding:: Returns the Encoding object that represents the encoding
11316 * of symbol.
11317 * - #end_with?:: Returns +true+ if symbol ends with
11318 * any of the given strings.
11319 * - #match:: Returns a MatchData object if symbol
11320 * matches a given Regexp; +nil+ otherwise.
11321 * - #match?:: Returns +true+ if symbol
11322 * matches a given Regexp; +false+ otherwise.
11323 * - #length, #size:: Returns the number of characters in symbol.
11324 * - #start_with?:: Returns +true+ if symbol starts with
11325 * any of the given strings.
11327 * === Methods for Comparing
11329 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as a given symbol is smaller than, equal to, or larger than symbol.
11330 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given symbol
11331 * has the same content and encoding.
11332 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as a given
11333 * symbol is smaller than, equal to, or larger than symbol.
11334 * - #casecmp?:: Returns +true+ if symbol is equal to a given symbol
11335 * after Unicode case folding; +false+ otherwise.
11337 * === Methods for Converting
11339 * - #capitalize:: Returns symbol with the first character upcased
11340 * and all other characters downcased.
11341 * - #downcase:: Returns symbol with all characters downcased.
11342 * - #inspect:: Returns the string representation of +self+ as a symbol literal.
11343 * - #name:: Returns the frozen string corresponding to symbol.
11344 * - #succ, #next:: Returns the symbol that is the successor to symbol.
11345 * - #swapcase:: Returns symbol with all upcase characters downcased
11346 * and all downcase characters upcased.
11347 * - #to_proc:: Returns a Proc object which responds to the method named by symbol.
11348 * - #to_s, #id2name:: Returns the string corresponding to +self+.
11349 * - #to_sym, #intern:: Returns +self+.
11350 * - #upcase:: Returns symbol with all characters upcased.
11356 * call-seq:
11357 * sym == obj -> true or false
11359 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
11360 * symbol, returns <code>true</code>.
11363 #define sym_equal rb_obj_equal
11365 static int
11366 sym_printable(const char *s, const char *send, rb_encoding *enc)
11368 while (s < send) {
11369 int n;
11370 int c = rb_enc_precise_mbclen(s, send, enc);
11372 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11373 n = MBCLEN_CHARFOUND_LEN(c);
11374 c = rb_enc_mbc_to_codepoint(s, send, enc);
11375 if (!rb_enc_isprint(c, enc)) return FALSE;
11376 s += n;
11378 return TRUE;
11382 rb_str_symname_p(VALUE sym)
11384 rb_encoding *enc;
11385 const char *ptr;
11386 long len;
11387 rb_encoding *resenc = rb_default_internal_encoding();
11389 if (resenc == NULL) resenc = rb_default_external_encoding();
11390 enc = STR_ENC_GET(sym);
11391 ptr = RSTRING_PTR(sym);
11392 len = RSTRING_LEN(sym);
11393 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11394 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11395 return FALSE;
11397 return TRUE;
11400 VALUE
11401 rb_str_quote_unprintable(VALUE str)
11403 rb_encoding *enc;
11404 const char *ptr;
11405 long len;
11406 rb_encoding *resenc;
11408 Check_Type(str, T_STRING);
11409 resenc = rb_default_internal_encoding();
11410 if (resenc == NULL) resenc = rb_default_external_encoding();
11411 enc = STR_ENC_GET(str);
11412 ptr = RSTRING_PTR(str);
11413 len = RSTRING_LEN(str);
11414 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11415 !sym_printable(ptr, ptr + len, enc)) {
11416 return rb_str_escape(str);
11418 return str;
11421 MJIT_FUNC_EXPORTED VALUE
11422 rb_id_quote_unprintable(ID id)
11424 VALUE str = rb_id2str(id);
11425 if (!rb_str_symname_p(str)) {
11426 return rb_str_escape(str);
11428 return str;
11432 * call-seq:
11433 * sym.inspect -> string
11435 * Returns the representation of <i>sym</i> as a symbol literal.
11437 * :fred.inspect #=> ":fred"
11440 static VALUE
11441 sym_inspect(VALUE sym)
11443 VALUE str = rb_sym2str(sym);
11444 const char *ptr;
11445 long len;
11446 char *dest;
11448 if (!rb_str_symname_p(str)) {
11449 str = rb_str_inspect(str);
11450 len = RSTRING_LEN(str);
11451 rb_str_resize(str, len + 1);
11452 dest = RSTRING_PTR(str);
11453 memmove(dest + 1, dest, len);
11455 else {
11456 rb_encoding *enc = STR_ENC_GET(str);
11457 RSTRING_GETMEM(str, ptr, len);
11458 str = rb_enc_str_new(0, len + 1, enc);
11459 dest = RSTRING_PTR(str);
11460 memcpy(dest + 1, ptr, len);
11462 dest[0] = ':';
11463 return str;
11466 #if 0 /* for RDoc */
11468 * call-seq:
11469 * sym.name -> string
11471 * Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
11472 * returned string is frozen.
11474 * :fred.name #=> "fred"
11475 * :fred.name.frozen? #=> true
11476 * :fred.to_s #=> "fred"
11477 * :fred.to_s.frozen? #=> false
11479 VALUE
11480 rb_sym2str(VALUE sym)
11484 #endif
11488 * call-seq:
11489 * sym.id2name -> string
11490 * sym.to_s -> string
11492 * Returns the name or string corresponding to <i>sym</i>.
11494 * :fred.id2name #=> "fred"
11495 * :ginger.to_s #=> "ginger"
11497 * Note that this string is not frozen (unlike the symbol itself).
11498 * To get a frozen string, use #name.
11502 VALUE
11503 rb_sym_to_s(VALUE sym)
11505 return str_new_shared(rb_cString, rb_sym2str(sym));
11510 * call-seq:
11511 * sym.to_sym -> sym
11512 * sym.intern -> sym
11514 * In general, <code>to_sym</code> returns the Symbol corresponding
11515 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
11516 * in this case.
11519 static VALUE
11520 sym_to_sym(VALUE sym)
11522 return sym;
11525 MJIT_FUNC_EXPORTED VALUE
11526 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11528 VALUE obj;
11530 if (argc < 1) {
11531 rb_raise(rb_eArgError, "no receiver given");
11533 obj = argv[0];
11534 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11537 #if 0
11539 * call-seq:
11540 * sym.to_proc
11542 * Returns a _Proc_ object which calls the method named by _sym_ on its first argument.
11544 * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
11547 VALUE
11548 rb_sym_to_proc(VALUE sym)
11551 #endif
11554 * call-seq:
11556 * sym.succ
11558 * Same as <code>sym.to_s.succ.intern</code>.
11561 static VALUE
11562 sym_succ(VALUE sym)
11564 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11568 * call-seq:
11570 * symbol <=> other_symbol -> -1, 0, +1, or nil
11572 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11573 * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11574 * less than, equal to, or greater than +other_symbol+.
11576 * +nil+ is returned if the two values are incomparable.
11578 * See String#<=> for more information.
11581 static VALUE
11582 sym_cmp(VALUE sym, VALUE other)
11584 if (!SYMBOL_P(other)) {
11585 return Qnil;
11587 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11591 * call-seq:
11592 * casecmp(other_symbol) -> -1, 0, 1, or nil
11594 * Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
11596 * :aBcDeF.casecmp(:abcde) # => 1
11597 * :aBcDeF.casecmp(:abcdef) # => 0
11598 * :aBcDeF.casecmp(:abcdefg) # => -1
11599 * :abcdef.casecmp(:ABCDEF) # => 0
11601 * Returns +nil+ if the two symbols have incompatible encodings,
11602 * or if +other_symbol+ is not a symbol:
11604 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11605 * other_sym = :"\u{c4 d6 dc}"
11606 * sym.casecmp(other_sym) # => nil
11607 * :foo.casecmp(2) # => nil
11609 * Currently, case-insensitivity only works on characters A-Z/a-z,
11610 * not all of Unicode. This is different from Symbol#casecmp?.
11612 * Related: Symbol#casecmp?.
11616 static VALUE
11617 sym_casecmp(VALUE sym, VALUE other)
11619 if (!SYMBOL_P(other)) {
11620 return Qnil;
11622 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11626 * call-seq:
11627 * casecmp?(other_symbol) -> true, false, or nil
11629 * Returns +true+ if +sym+ and +other_symbol+ are equal after
11630 * Unicode case folding, +false+ if they are not equal:
11632 * :aBcDeF.casecmp?(:abcde) # => false
11633 * :aBcDeF.casecmp?(:abcdef) # => true
11634 * :aBcDeF.casecmp?(:abcdefg) # => false
11635 * :abcdef.casecmp?(:ABCDEF) # => true
11636 * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11638 * Returns +nil+ if the two symbols have incompatible encodings,
11639 * or if +other_symbol+ is not a symbol:
11641 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11642 * other_sym = :"\u{c4 d6 dc}"
11643 * sym.casecmp?(other_sym) # => nil
11644 * :foo.casecmp?(2) # => nil
11646 * See {Case Mapping}[doc/case_mapping_rdoc.html].
11648 * Related: Symbol#casecmp.
11652 static VALUE
11653 sym_casecmp_p(VALUE sym, VALUE other)
11655 if (!SYMBOL_P(other)) {
11656 return Qnil;
11658 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11662 * call-seq:
11663 * sym =~ obj -> integer or nil
11665 * Returns <code>sym.to_s =~ obj</code>.
11668 static VALUE
11669 sym_match(VALUE sym, VALUE other)
11671 return rb_str_match(rb_sym2str(sym), other);
11675 * call-seq:
11676 * sym.match(pattern) -> matchdata or nil
11677 * sym.match(pattern, pos) -> matchdata or nil
11679 * Returns <code>sym.to_s.match</code>.
11682 static VALUE
11683 sym_match_m(int argc, VALUE *argv, VALUE sym)
11685 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11689 * call-seq:
11690 * sym.match?(pattern) -> true or false
11691 * sym.match?(pattern, pos) -> true or false
11693 * Returns <code>sym.to_s.match?</code>.
11696 static VALUE
11697 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11699 return rb_str_match_m_p(argc, argv, sym);
11703 * call-seq:
11704 * sym[idx] -> char
11705 * sym[b, n] -> string
11706 * sym.slice(idx) -> char
11707 * sym.slice(b, n) -> string
11709 * Returns <code>sym.to_s[]</code>.
11712 static VALUE
11713 sym_aref(int argc, VALUE *argv, VALUE sym)
11715 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11719 * call-seq:
11720 * sym.length -> integer
11721 * sym.size -> integer
11723 * Same as <code>sym.to_s.length</code>.
11726 static VALUE
11727 sym_length(VALUE sym)
11729 return rb_str_length(rb_sym2str(sym));
11733 * call-seq:
11734 * sym.empty? -> true or false
11736 * Returns whether _sym_ is :"" or not.
11739 static VALUE
11740 sym_empty(VALUE sym)
11742 return rb_str_empty(rb_sym2str(sym));
11746 * call-seq:
11747 * upcase(*options) -> symbol
11749 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11751 * See String#upcase.
11755 static VALUE
11756 sym_upcase(int argc, VALUE *argv, VALUE sym)
11758 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11762 * call-seq:
11763 * downcase(*options) -> symbol
11765 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11767 * See String#downcase.
11769 * Related: Symbol#upcase.
11773 static VALUE
11774 sym_downcase(int argc, VALUE *argv, VALUE sym)
11776 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11780 * call-seq:
11781 * capitalize(*options) -> symbol
11783 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11785 * See String#capitalize.
11789 static VALUE
11790 sym_capitalize(int argc, VALUE *argv, VALUE sym)
11792 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11796 * call-seq:
11797 * swapcase(*options) -> symbol
11799 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11801 * See String#swapcase.
11805 static VALUE
11806 sym_swapcase(int argc, VALUE *argv, VALUE sym)
11808 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11812 * call-seq:
11813 * sym.start_with?([prefixes]+) -> true or false
11815 * Returns true if +sym+ starts with one of the +prefixes+ given.
11816 * Each of the +prefixes+ should be a String or a Regexp.
11818 * :hello.start_with?("hell") #=> true
11819 * :hello.start_with?(/H/i) #=> true
11821 * # returns true if one of the prefixes matches.
11822 * :hello.start_with?("heaven", "hell") #=> true
11823 * :hello.start_with?("heaven", "paradise") #=> false
11826 static VALUE
11827 sym_start_with(int argc, VALUE *argv, VALUE sym)
11829 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11833 * call-seq:
11834 * sym.end_with?([suffixes]+) -> true or false
11836 * Returns true if +sym+ ends with one of the +suffixes+ given.
11838 * :hello.end_with?("ello") #=> true
11840 * # returns true if one of the +suffixes+ matches.
11841 * :hello.end_with?("heaven", "ello") #=> true
11842 * :hello.end_with?("heaven", "paradise") #=> false
11845 static VALUE
11846 sym_end_with(int argc, VALUE *argv, VALUE sym)
11848 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11852 * call-seq:
11853 * sym.encoding -> encoding
11855 * Returns the Encoding object that represents the encoding of _sym_.
11858 static VALUE
11859 sym_encoding(VALUE sym)
11861 return rb_obj_encoding(rb_sym2str(sym));
11864 static VALUE
11865 string_for_symbol(VALUE name)
11867 if (!RB_TYPE_P(name, T_STRING)) {
11868 VALUE tmp = rb_check_string_type(name);
11869 if (NIL_P(tmp)) {
11870 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11871 name);
11873 name = tmp;
11875 return name;
11879 rb_to_id(VALUE name)
11881 if (SYMBOL_P(name)) {
11882 return SYM2ID(name);
11884 name = string_for_symbol(name);
11885 return rb_intern_str(name);
11888 VALUE
11889 rb_to_symbol(VALUE name)
11891 if (SYMBOL_P(name)) {
11892 return name;
11894 name = string_for_symbol(name);
11895 return rb_str_intern(name);
11899 * call-seq:
11900 * Symbol.all_symbols => array
11902 * Returns an array of all the symbols currently in Ruby's symbol
11903 * table.
11905 * Symbol.all_symbols.size #=> 903
11906 * Symbol.all_symbols[1,20] #=> [:floor, :ARGV, :Binding, :symlink,
11907 * :chown, :EOFError, :$;, :String,
11908 * :LOCK_SH, :"setuid?", :$<,
11909 * :default_proc, :compact, :extend,
11910 * :Tms, :getwd, :$=, :ThreadGroup,
11911 * :wait2, :$>]
11914 static VALUE
11915 sym_all_symbols(VALUE _)
11917 return rb_sym_all_symbols();
11920 VALUE
11921 rb_str_to_interned_str(VALUE str)
11923 return rb_fstring(str);
11926 VALUE
11927 rb_interned_str(const char *ptr, long len)
11929 struct RString fake_str;
11930 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11933 VALUE
11934 rb_interned_str_cstr(const char *ptr)
11936 return rb_interned_str(ptr, strlen(ptr));
11939 VALUE
11940 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11942 if (UNLIKELY(rb_enc_autoload_p(enc))) {
11943 rb_enc_autoload(enc);
11946 struct RString fake_str;
11947 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11950 VALUE
11951 rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
11953 return rb_enc_interned_str(ptr, strlen(ptr), enc);
11957 * A \String object has an arbitrary sequence of bytes,
11958 * typically representing text or binary data.
11959 * A \String object may be created using String::new or as literals.
11961 * String objects differ from Symbol objects in that Symbol objects are
11962 * designed to be used as identifiers, instead of text or data.
11964 * You can create a \String object explicitly with:
11966 * - A {string literal}[doc/syntax/literals_rdoc.html#label-String+Literals].
11967 * - A {heredoc literal}[doc/syntax/literals_rdoc.html#label-Here+Document+Literals].
11969 * You can convert certain objects to Strings with:
11971 * - \Method {String}[Kernel.html#method-i-String].
11973 * Some \String methods modify +self+.
11974 * Typically, a method whose name ends with <tt>!</tt> modifies +self+
11975 * and returns +self+;
11976 * often a similarly named method (without the <tt>!</tt>)
11977 * returns a new string.
11979 * In general, if both bang and non-bang versions of a method exist,
11980 * the bang version mutates +self+ and the non-bang version does not.
11981 * However, a method without a bang can also mutate, such as String#replace.
11983 * == Substitution Methods
11985 * These methods perform substitutions:
11987 * - String#sub: One substitution (or none); returns a new string.
11988 * - String#sub!: One substitution (or none); returns +self+.
11989 * - String#gsub: Zero or more substitutions; returns a new string.
11990 * - String#gsub!: Zero or more substitutions; returns +self+.
11992 * Each of these methods takes:
11994 * - A first argument, +pattern+ (string or regexp),
11995 * that specifies the substring(s) to be replaced.
11997 * - Either of these:
11999 * - A second argument, +replacement+ (string or hash),
12000 * that determines the replacing string.
12001 * - A block that will determine the replacing string.
12003 * The examples in this section mostly use methods String#sub and String#gsub;
12004 * the principles illustrated apply to all four substitution methods.
12006 * <b>Argument +pattern+</b>
12008 * Argument +pattern+ is commonly a regular expression:
12010 * s = 'hello'
12011 * s.sub(/[aeiou]/, '*') # => "h*llo"
12012 * s.gsub(/[aeiou]/, '*') # => "h*ll*"
12013 * s.gsub(/[aeiou]/, '') # => "hll"
12014 * s.sub(/ell/, 'al') # => "halo"
12015 * s.gsub(/xyzzy/, '*') # => "hello"
12016 * 'THX1138'.gsub(/\d+/, '00') # => "THX00"
12018 * When +pattern+ is a string, all its characters are treated
12019 * as ordinary characters (not as regexp special characters):
12021 * 'THX1138'.gsub('\d+', '00') # => "THX1138"
12023 * <b>\String +replacement+</b>
12025 * If +replacement+ is a string, that string will determine
12026 * the replacing string that is to be substituted for the matched text.
12028 * Each of the examples above uses a simple string as the replacing string.
12030 * \String +replacement+ may contain back-references to the pattern's captures:
12032 * - <tt>\n</tt> (_n_ a non-negative integer) refers to <tt>$n</tt>.
12033 * - <tt>\k<name></tt> refers to the named capture +name+.
12035 * See rdoc-ref:regexp.rdoc for details.
12037 * Note that within the string +replacement+, a character combination
12038 * such as <tt>$&</tt> is treated as ordinary text, and not as
12039 * a special match variable.
12040 * However, you may refer to some special match variables using these
12041 * combinations:
12043 * - <tt>\&</tt> and <tt>\0</tt> correspond to <tt>$&</tt>,
12044 * which contains the complete matched text.
12045 * - <tt>\'</tt> corresponds to <tt>$'</tt>,
12046 * which contains string after match.
12047 * - <tt>\`</tt> corresponds to <tt>$`</tt>,
12048 * which contains string before match.
12049 * - <tt>\+</tt> corresponds to <tt>$+</tt>,
12050 * which contains last capture group.
12052 * See rdoc-ref:regexp.rdoc for details.
12054 * Note that <tt>\\\\</tt> is interpreted as an escape, i.e., a single backslash.
12056 * Note also that a string literal consumes backslashes.
12057 * See {String Literals}[doc/syntax/literals_rdoc.html#label-String+Literals] for details about string literals.
12059 * A back-reference is typically preceded by an additional backslash.
12060 * For example, if you want to write a back-reference <tt>\&</tt> in
12061 * +replacement+ with a double-quoted string literal, you need to write
12062 * <tt>"..\\\\&.."</tt>.
12064 * If you want to write a non-back-reference string <tt>\&</tt> in
12065 * +replacement+, you need first to escape the backslash to prevent
12066 * this method from interpreting it as a back-reference, and then you
12067 * need to escape the backslashes again to prevent a string literal from
12068 * consuming them: <tt>"..\\\\\\\\&.."</tt>.
12070 * You may want to use the block form to avoid a lot of backslashes.
12072 * <b>\Hash +replacement+</b>
12074 * If argument +replacement+ is a hash, and +pattern+ matches one of its keys,
12075 * the replacing string is the value for that key:
12077 * h = {'foo' => 'bar', 'baz' => 'bat'}
12078 * 'food'.sub('foo', h) # => "bard"
12080 * Note that a symbol key does not match:
12082 * h = {foo: 'bar', baz: 'bat'}
12083 * 'food'.sub('foo', h) # => "d"
12085 * <b>Block</b>
12087 * In the block form, the current match string is passed to the block;
12088 * the block's return value becomes the replacing string:
12090 * s = '@'
12091 * '1234'.gsub(/\d/) {|match| s.succ! } # => "ABCD"
12093 * Special match variables such as <tt>$1</tt>, <tt>$2</tt>, <tt>$`</tt>,
12094 * <tt>$&</tt>, and <tt>$'</tt> are set appropriately.
12097 * == What's Here
12099 * First, what's elsewhere. \Class \String:
12101 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
12102 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
12104 * Here, class \String provides methods that are useful for:
12106 * - {Creating a String}[#class-String-label-Methods+for+Creating+a+String]
12107 * - {Frozen/Unfrozen Strings}[#class-String-label-Methods+for+a+Frozen-2FUnfrozen+String]
12108 * - {Querying}[#class-String-label-Methods+for+Querying]
12109 * - {Comparing}[#class-String-label-Methods+for+Comparing]
12110 * - {Modifying a String}[#class-String-label-Methods+for+Modifying+a+String]
12111 * - {Converting to New String}[#class-String-label-Methods+for+Converting+to+New+String]
12112 * - {Converting to Non-String}[#class-String-label-Methods+for+Converting+to+Non--5CString]
12113 * - {Iterating}[#class-String-label-Methods+for+Iterating]
12115 * === Methods for Creating a \String
12117 * - ::new:: Returns a new string.
12118 * - ::try_convert:: Returns a new string created from a given object.
12120 * === Methods for a Frozen/Unfrozen String
12122 * - {#+string}[#method-i-2B-40]:: Returns a string that is not frozen:
12123 * +self+, if not frozen; +self.dup+ otherwise.
12124 * - {#-string}[#method-i-2D-40]:: Returns a string that is frozen:
12125 * +self+, if already frozen; +self.freeze+ otherwise.
12126 * - #freeze:: Freezes +self+, if not already frozen; returns +self+.
12128 * === Methods for Querying
12130 * _Counts_
12132 * - #length, #size:: Returns the count of characters (not bytes).
12133 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12134 * - #bytesize:: Returns the count of bytes.
12135 * - #count:: Returns the count of substrings matching given strings.
12137 * _Substrings_
12139 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring that matches a given Regexp or other object;
12140 * returns +nil+ if no match is found.
12141 * - #index:: Returns the index of the _first_ occurrence of a given substring;
12142 * returns +nil+ if none found.
12143 * - #rindex:: Returns the index of the _last_ occurrence of a given substring;
12144 * returns +nil+ if none found.
12145 * - #include?:: Returns +true+ if the string contains a given substring; +false+ otherwise.
12146 * - #match:: Returns a MatchData object if the string matches a given Regexp; +nil+ otherwise.
12147 * - #match?:: Returns +true+ if the string matches a given Regexp; +false+ otherwise.
12148 * - #start_with?:: Returns +true+ if the string begins with any of the given substrings.
12149 * - #end_with?:: Returns +true+ if the string ends with any of the given substrings.
12151 * _Encodings_
12153 * - #encoding:: Returns the Encoding object that represents the encoding of the string.
12154 * - #unicode_normalized?:: Returns +true+ if the string is in Unicode normalized form; +false+ otherwise.
12155 * - #valid_encoding?:: Returns +true+ if the string contains only characters that are valid
12156 * for its encoding.
12157 * - #ascii_only?:: Returns +true+ if the string has only ASCII characters; +false+ otherwise.
12159 * _Other_
12161 * - #sum:: Returns a basic checksum for the string: the sum of each byte.
12162 * - #hash:: Returns the integer hash code.
12164 * === Methods for Comparing
12166 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given other string has the same content as +self+.
12167 * - #eql?:: Returns +true+ if the content is the same as the given other string.
12168 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as +self+ is smaller than, equal to, or larger than a given other string.
12169 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as +self+
12170 * is smaller than, equal to, or larger than a given other string.
12171 * - #casecmp?:: Returns +true+ if the string is equal to a given string after Unicode case folding;
12172 * +false+ otherwise.
12174 * === Methods for Modifying a \String
12176 * Each of these methods modifies +self+.
12178 * _Insertion_
12180 * - #insert:: Returns +self+ with a given string inserted at a given offset.
12181 * - #<<:: Returns +self+ concatenated with a given string or integer.
12183 * _Substitution_
12185 * - #sub!:: Replaces the first substring that matches a given pattern with a given replacement string;
12186 * returns +self+ if any changes, +nil+ otherwise.
12187 * - #gsub!:: Replaces each substring that matches a given pattern with a given replacement string;
12188 * returns +self+ if any changes, +nil+ otherwise.
12189 * - #succ!, #next!:: Returns +self+ modified to become its own successor.
12190 * - #replace:: Returns +self+ with its entire content replaced by a given string.
12191 * - #reverse!:: Returns +self+ with its characters in reverse order.
12192 * - #setbyte:: Sets the byte at a given integer offset to a given value; returns the argument.
12193 * - #tr!:: Replaces specified characters in +self+ with specified replacement characters;
12194 * returns +self+ if any changes, +nil+ otherwise.
12195 * - #tr_s!:: Replaces specified characters in +self+ with specified replacement characters,
12196 * removing duplicates from the substrings that were modified;
12197 * returns +self+ if any changes, +nil+ otherwise.
12199 * _Casing_
12201 * - #capitalize!:: Upcases the initial character and downcases all others;
12202 * returns +self+ if any changes, +nil+ otherwise.
12203 * - #downcase!:: Downcases all characters; returns +self+ if any changes, +nil+ otherwise.
12204 * - #upcase!:: Upcases all characters; returns +self+ if any changes, +nil+ otherwise.
12205 * - #swapcase!:: Upcases each downcase character and downcases each upcase character;
12206 * returns +self+ if any changes, +nil+ otherwise.
12208 * _Encoding_
12210 * - #encode!:: Returns +self+ with all characters transcoded from one given encoding into another.
12211 * - #unicode_normalize!:: Unicode-normalizes +self+; returns +self+.
12212 * - #scrub!:: Replaces each invalid byte with a given character; returns +self+.
12213 * - #force_encoding:: Changes the encoding to a given encoding; returns +self+.
12215 * _Deletion_
12217 * - #clear:: Removes all content, so that +self+ is empty; returns +self+.
12218 * - #slice!, #[]=:: Removes a substring determined by a given index, start/length, range, regexp, or substring.
12219 * - #squeeze!:: Removes contiguous duplicate characters; returns +self+ if any changes, +nil+ otherwise.
12220 * - #delete!:: Removes characters as determined by the intersection of substring arguments.
12221 * - #lstrip!:: Removes leading whitespace; returns +self+ if any changes, +nil+ otherwise.
12222 * - #rstrip!:: Removes trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12223 * - #strip!:: Removes leading and trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12224 * - #chomp!:: Removes trailing record separator, if found; returns +self+ if any changes, +nil+ otherwise.
12225 * - #chop!:: Removes a trailing <tt>\r\n</tt> if found, otherwise removes the last character;
12226 * returns +self+ if any changes, +nil+ otherwise.
12228 * === Methods for Converting to New \String
12230 * Each of these methods returns a new \String based on +self+,
12231 * often just a modified copy of +self+.
12233 * _Extension_
12235 * - #*:: Returns the concatenation of multiple copies of +self+.
12236 * - #+:: Returns the concatenation of +self+ and a given other string.
12237 * - #center:: Returns a copy of +self+ centered between pad substring.
12238 * - #concat:: Returns the concatenation of +self+ with given other strings.
12239 * - #prepend:: Returns the concatenation of a given other string with +self+.
12240 * - #ljust:: Returns a copy of +self+ of a given length, right-padded with a given other string.
12241 * - #rjust:: Returns a copy of +self+ of a given length, left-padded with a given other string.
12243 * _Encoding_
12245 * - #b:: Returns a copy of +self+ with ASCII-8BIT encoding.
12246 * - #scrub:: Returns a copy of +self+ with each invalid byte replaced with a given character.
12247 * - #unicode_normalize:: Returns a copy of +self+ with each character Unicode-normalized.
12248 * - #encode:: Returns a copy of +self+ with all characters transcoded from one given encoding into another.
12250 * _Substitution_
12252 * - #dump:: Returns a copy of +self+ with all non-printing characters replaced by \xHH notation
12253 * and all special characters escaped.
12254 * - #undump:: Returns a copy of +self+ with all <tt>\xNN</tt> notation replaced by <tt>\uNNNN</tt> notation
12255 * and all escaped characters unescaped.
12256 * - #sub:: Returns a copy of +self+ with the first substring matching a given pattern
12257 * replaced with a given replacement string.
12258 * - #gsub:: Returns a copy of +self+ with each substring that matches a given pattern
12259 * replaced with a given replacement string.
12260 * - #succ, #next:: Returns the string that is the successor to +self+.
12261 * - #reverse:: Returns a copy of +self+ with its characters in reverse order.
12262 * - #tr:: Returns a copy of +self+ with specified characters replaced with specified replacement characters.
12263 * - #tr_s:: Returns a copy of +self+ with specified characters replaced with specified replacement characters,
12264 * removing duplicates from the substrings that were modified.
12265 * - #%:: Returns the string resulting from formatting a given object into +self+.
12267 * _Casing_
12269 * - #capitalize:: Returns a copy of +self+ with the first character upcased
12270 * and all other characters downcased.
12271 * - #downcase:: Returns a copy of +self+ with all characters downcased.
12272 * - #upcase:: Returns a copy of +self+ with all characters upcased.
12273 * - #swapcase:: Returns a copy of +self+ with all upcase characters downcased
12274 * and all downcase characters upcased.
12276 * _Deletion_
12278 * - #delete:: Returns a copy of +self+ with characters removed, as determined by the intersection of substring arguments.
12279 * - #delete_prefix:: Returns a copy of +self+ with a given prefix removed.
12280 * - #delete_suffix:: Returns a copy of +self+ with a given suffix removed.
12281 * - #lstrip:: Returns a copy of +self+ with leading whitespace removed.
12282 * - #rstrip:: Returns a copy of +self+ with trailing whitespace removed.
12283 * - #strip:: Returns a copy of +self+ with leading and trailing whitespace removed.
12284 * - #chomp:: Returns a copy of +self+ with a trailing record separator removed, if found.
12285 * - #chop:: Returns a copy of +self+ with a trailing <tt>\r\n</tt> removed if found; otherwise with the last character removed.
12286 * - #squeeze:: Returns a copy of +self+ with contiguous duplicate characters removed.
12287 * - #[], #slice:: Returns a substring determined by a given index, start/length, or range, or string.
12288 * - #byteslice:: Returns a substring determined by a given index, start/length, or range.
12289 * - #chr:: Returns the first character.
12291 * _Duplication_
12293 * - #to_s, #to_str:: If +self+ is a subclass of \String, returns +self+ copied into a \String;
12294 * otherwise, returns +self+.
12296 * === Methods for Converting to Non-\String
12298 * Each of these methods converts the contents of +self+ to a non-\String.
12300 * <em>Characters, Bytes, and Clusters</em>
12302 * - #bytes:: Returns an array of the bytes in +self+.
12303 * - #chars:: Returns an array of the characters in +self+.
12304 * - #codepoints:: Returns an array of the integer ordinals in +self+.
12305 * - #getbyte:: Returns an integer byte as determined by a given index.
12306 * - #grapheme_clusters:: Returns an array of the grapheme clusters in +self+.
12308 * _Splitting_
12310 * - #lines:: Returns an array of the lines in +self+, as determined by a given record separator.
12311 * - #partition:: Returns a 3-element array determined by the first substring that matches
12312 * a given substring or regexp.
12313 * - #rpartition:: Returns a 3-element array determined by the last substring that matches
12314 * a given substring or regexp.
12315 * - #split:: Returns an array of substrings determined by a given delimiter -- regexp or string --
12316 * or, if a block given, passes those substrings to the block.
12318 * _Matching_
12320 * - #scan:: Returns an array of substrings matching a given regexp or string, or,
12321 * if a block given, passes each matching substring to the block.
12322 * - #unpack:: Returns an array of substrings extracted from +self+ according to a given format.
12323 * - #unpack1:: Returns the first substring extracted from +self+ according to a given format.
12325 * _Numerics_
12327 * - #hex:: Returns the integer value of the leading characters, interpreted as hexadecimal digits.
12328 * - #oct:: Returns the integer value of the leading characters, interpreted as octal digits.
12329 * - #ord:: Returns the integer ordinal of the first character in +self+.
12330 * - #to_i:: Returns the integer value of leading characters, interpreted as an integer.
12331 * - #to_f:: Returns the floating-point value of leading characters, interpreted as a floating-point number.
12333 * <em>Strings and Symbols</em>
12335 * - #inspect:: Returns a copy of +self+, enclosed in double-quotes, with special characters escaped.
12336 * - #to_sym, #intern:: Returns the symbol corresponding to +self+.
12338 * === Methods for Iterating
12340 * - #each_byte:: Calls the given block with each successive byte in +self+.
12341 * - #each_char:: Calls the given block with each successive character in +self+.
12342 * - #each_codepoint:: Calls the given block with each successive integer codepoint in +self+.
12343 * - #each_grapheme_cluster:: Calls the given block with each successive grapheme cluster in +self+.
12344 * - #each_line:: Calls the given block with each successive line in +self+,
12345 * as determined by a given record separator.
12346 * - #upto:: Calls the given block with each string value returned by successive calls to #succ.
12349 void
12350 Init_String(void)
12352 rb_cString = rb_define_class("String", rb_cObject);
12353 assert(rb_vm_fstring_table());
12354 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12355 rb_include_module(rb_cString, rb_mComparable);
12356 rb_define_alloc_func(rb_cString, empty_str_alloc);
12357 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12358 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12359 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12360 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12361 rb_define_method(rb_cString, "==", rb_str_equal, 1);
12362 rb_define_method(rb_cString, "===", rb_str_equal, 1);
12363 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12364 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12365 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12366 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12367 rb_define_method(rb_cString, "+", rb_str_plus, 1);
12368 rb_define_method(rb_cString, "*", rb_str_times, 1);
12369 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12370 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12371 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12372 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12373 rb_define_method(rb_cString, "length", rb_str_length, 0);
12374 rb_define_method(rb_cString, "size", rb_str_length, 0);
12375 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12376 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12377 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12378 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12379 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12380 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
12381 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12382 rb_define_method(rb_cString, "next", rb_str_succ, 0);
12383 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12384 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12385 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12386 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12387 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12388 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12389 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12390 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12391 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12392 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12393 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12394 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12395 rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12396 rb_define_method(rb_cString, "+@", str_uplus, 0);
12397 rb_define_method(rb_cString, "-@", str_uminus, 0);
12399 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12400 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12401 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12402 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12403 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12404 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
12405 rb_define_method(rb_cString, "undump", str_undump, 0);
12407 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12408 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12409 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12410 sym_fold = ID2SYM(rb_intern_const("fold"));
12412 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12413 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12414 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12415 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12417 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12418 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12419 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12420 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12422 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12423 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12424 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12425 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12426 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12427 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12428 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12429 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12430 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12431 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12432 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12433 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
12434 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12435 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12436 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12437 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12438 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12440 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12441 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12442 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12444 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12446 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12447 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12448 rb_define_method(rb_cString, "center", rb_str_center, -1);
12450 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12451 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12452 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12453 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12454 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12455 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12456 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12457 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12458 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12460 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12461 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12462 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12463 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12464 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12465 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12466 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12467 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12468 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12470 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12471 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12472 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12473 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12474 rb_define_method(rb_cString, "count", rb_str_count, -1);
12476 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12477 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12478 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12479 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12481 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12482 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12483 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12484 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12485 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12487 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12489 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12490 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12492 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12493 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12495 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12496 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12497 rb_define_method(rb_cString, "b", rb_str_b, 0);
12498 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12499 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12501 /* define UnicodeNormalize module here so that we don't have to look it up */
12502 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12503 id_normalize = rb_intern_const("normalize");
12504 id_normalized_p = rb_intern_const("normalized?");
12506 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12507 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12508 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12510 rb_fs = Qnil;
12511 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12512 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12513 rb_gc_register_address(&rb_fs);
12515 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12516 rb_include_module(rb_cSymbol, rb_mComparable);
12517 rb_undef_alloc_func(rb_cSymbol);
12518 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
12519 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12521 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12522 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12523 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12524 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
12525 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12526 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0);
12527 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
12528 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
12529 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
12530 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12531 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12533 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12534 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12535 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12536 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12538 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12539 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12540 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12541 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12542 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12543 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12544 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12546 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12547 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12548 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12549 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12551 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12552 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12554 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);