1 /**********************************************************************
6 created at: Mon Aug 9 17:12:58 JST 1993
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
12 **********************************************************************/
14 #include "ruby/internal/config.h"
24 #include "debug_counter.h"
29 #include "internal/array.h"
30 #include "internal/compar.h"
31 #include "internal/compilers.h"
32 #include "internal/encoding.h"
33 #include "internal/error.h"
34 #include "internal/gc.h"
35 #include "internal/numeric.h"
36 #include "internal/object.h"
37 #include "internal/proc.h"
38 #include "internal/re.h"
39 #include "internal/sanitizers.h"
40 #include "internal/string.h"
41 #include "internal/transcode.h"
43 #include "ruby/encoding.h"
45 #include "ruby/util.h"
46 #include "ruby_assert.h"
49 #if defined HAVE_CRYPT_R
50 # if defined HAVE_CRYPT_H
53 #elif !defined HAVE_CRYPT
54 # include "missing/crypt.h"
55 # define HAVE_CRYPT_R 1
/* Shorthand for element `no` of regs->beg / regs->end.  Both macros expand
 * to an access through a variable named `regs`, so one must be in scope
 * wherever they are used.
 * NOTE(review): presumably `regs` holds regexp match registers and `no` is
 * a capture-group index -- confirm against the call sites. */
58 #define BEG(no) (regs->beg[(no)])
59 #define END(no) (regs->end[(no)])
62 #undef rb_usascii_str_new
63 #undef rb_utf8_str_new
65 #undef rb_str_new_cstr
66 #undef rb_usascii_str_new_cstr
67 #undef rb_utf8_str_new_cstr
68 #undef rb_enc_str_new_cstr
69 #undef rb_external_str_new_cstr
70 #undef rb_locale_str_new_cstr
71 #undef rb_str_dup_frozen
72 #undef rb_str_buf_new_cstr
74 #undef rb_str_buf_cat2
76 #undef rb_str_cat_cstr
77 #undef rb_fstring_cstr
85 * 2: STR_SHARED (== ELTS_SHARED)
86 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
87 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
88 * other strings that rely on this string's buffer)
89 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
90 * early, specific to rb_str_tmp_frozen_{acquire,release})
91 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
92 * such as read(2). Any modification and realloc is prohibited)
94 * 8-9: ENC_CODERANGE (2 bits)
95 * 10-16: ENCODING (7 bits == 128)
97 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
98 * used for a string object based on C string literal)
99 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
100 * object header is temporarily allocated on C stack)
/* NOTE(review): presumably the upper bound, in bytes, for one character in
 * any supported encoding -- confirm against the users of this constant. */
103 #define RUBY_MAX_CHAR_LEN 16
/* String-specific flag bits.  Each one maps to the FL_USER<n> slot with the
 * same number as its entry in the flag table comment above
 * (5, 6, 7, 18 and 19 respectively). */
104 #define STR_SHARED_ROOT FL_USER5
105 #define STR_BORROWED FL_USER6
106 #define STR_TMPLOCK FL_USER7
107 #define STR_NOFREE FL_USER18
108 #define STR_FAKESTR FL_USER19
110 #define STR_SET_NOEMBED(str) do {\
111 FL_SET((str), STR_NOEMBED);\
113 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
116 STR_SET_EMBED_LEN((str), 0);\
/* Mark str as embedded by clearing STR_NOEMBED; STR_NOFREE is cleared in
 * the same step.  Only the flags change -- the length and contents are not
 * touched by this macro. */
119 #define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
121 # define STR_SET_EMBED_LEN(str, n) do { \
122 assert(str_embed_capa(str) > (n));\
123 RSTRING(str)->as.embed.len = (n);\
126 # define STR_SET_EMBED_LEN(str, n) do { \
128 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
129 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
133 #define STR_SET_LEN(str, n) do { \
134 if (STR_EMBED_P(str)) {\
135 STR_SET_EMBED_LEN((str), (n));\
138 RSTRING(str)->as.heap.len = (n);\
142 #define STR_DEC_LEN(str) do {\
143 if (STR_EMBED_P(str)) {\
144 long n = RSTRING_LEN(str);\
146 STR_SET_EMBED_LEN((str), n);\
149 RSTRING(str)->as.heap.len--;\
/* Terminator length for str: the value rb_enc_mbminlen() reports for the
 * encoding that rb_enc_get() returns for str.  Both calls are evaluated on
 * every expansion, so callers that need it repeatedly cache the result in
 * a local (as RESIZE_CAPA does with `termlen`). */
153 #define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
154 #define TERM_FILL(ptr, termlen) do {\
155 char *const term_fill_ptr = (ptr);\
156 const int term_fill_len = (termlen);\
157 *term_fill_ptr = '\0';\
158 if (UNLIKELY(term_fill_len > 1))\
159 memset(term_fill_ptr, 0, term_fill_len);\
162 #define RESIZE_CAPA(str,capacity) do {\
163 const int termlen = TERM_LEN(str);\
164 RESIZE_CAPA_TERM(str,capacity,termlen);\
166 #define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
167 if (STR_EMBED_P(str)) {\
168 if (str_embed_capa(str) < capacity + termlen) {\
169 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
170 const long tlen = RSTRING_LEN(str);\
171 memcpy(tmp, RSTRING_PTR(str), tlen);\
172 RSTRING(str)->as.heap.ptr = tmp;\
173 RSTRING(str)->as.heap.len = tlen;\
174 STR_SET_NOEMBED(str);\
175 RSTRING(str)->as.heap.aux.capa = (capacity);\
179 assert(!FL_TEST((str), STR_SHARED)); \
180 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
181 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
182 RSTRING(str)->as.heap.aux.capa = (capacity);\
186 #define STR_SET_SHARED(str, shared_str) do { \
187 if (!FL_TEST(str, STR_FAKESTR)) { \
188 assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
189 assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
190 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
191 FL_SET((str), STR_SHARED); \
192 FL_SET((shared_str), STR_SHARED_ROOT); \
193 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
194 FL_SET_RAW((shared_str), STR_BORROWED); \
/* Buffer pointer of the heap representation of str.  Reads as.heap.ptr
 * directly, without checking whether str is currently embedded. */
198 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Byte size used when reallocating or freeing the heap buffer: the
 * recorded capacity plus the terminator length, because capa itself does
 * not include the terminator (see the TODO below). */
199 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
200 /* TODO: include the terminator size in capa. */
/* Encoding lookup used by this file; simply forwards to get_encoding(). */
202 #define STR_ENC_GET(str) get_encoding(str)
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 when the build does not define
 * it.  With it off, SHARABLE_SUBSTRING_P accepts a substring only when it
 * runs up to `end` (beg + len == end); the alternative definition accepts
 * every substring. */
204 #if !defined SHARABLE_MIDDLE_SUBSTRING
205 # define SHARABLE_MIDDLE_SUBSTRING 0
207 #if !SHARABLE_MIDDLE_SUBSTRING
208 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
210 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
215 str_embed_capa(VALUE str
)
218 return rb_gc_obj_slot_size(str
) - offsetof(struct RString
, as
.embed
.ary
);
220 return RSTRING_EMBED_LEN_MAX
+ 1;
225 str_embed_size(long capa
)
227 return offsetof(struct RString
, as
.embed
.ary
) + capa
;
231 STR_EMBEDDABLE_P(long len
, long termlen
)
234 return rb_gc_size_allocatable_p(str_embed_size(len
+ termlen
));
236 return len
<= RSTRING_EMBED_LEN_MAX
+ 1 - termlen
;
240 static VALUE
str_replace_shared_without_enc(VALUE str2
, VALUE str
);
241 static VALUE
str_new_frozen(VALUE klass
, VALUE orig
);
242 static VALUE
str_new_frozen_buffer(VALUE klass
, VALUE orig
, int copy_encoding
);
243 static VALUE
str_new_static(VALUE klass
, const char *ptr
, long len
, int encindex
);
244 static VALUE
str_new(VALUE klass
, const char *ptr
, long len
);
245 static void str_make_independent_expand(VALUE str
, long len
, long expand
, const int termlen
);
246 static inline void str_modifiable(VALUE str
);
247 static VALUE
rb_str_downcase(int argc
, VALUE
*argv
, VALUE str
);
250 str_make_independent(VALUE str
)
252 long len
= RSTRING_LEN(str
);
253 int termlen
= TERM_LEN(str
);
254 str_make_independent_expand((str
), len
, 0L, termlen
);
257 static inline int str_dependent_p(VALUE str
);
260 rb_str_make_independent(VALUE str
)
262 if (str_dependent_p(str
)) {
263 str_make_independent(str
);
268 rb_debug_rstring_null_ptr(const char *func
)
270 fprintf(stderr
, "%s is returning NULL!! "
271 "SIGSEGV is highly expected to follow immediately. "
272 "If you could reproduce, attach your debugger here, "
273 "and look at the passed string.",
277 /* symbols for [up|down|swap]case/capitalize options */
278 static VALUE sym_ascii
, sym_turkic
, sym_lithuanian
, sym_fold
;
281 get_actual_encoding(const int encidx
, VALUE str
)
283 const unsigned char *q
;
286 case ENCINDEX_UTF_16
:
287 if (RSTRING_LEN(str
) < 2) break;
288 q
= (const unsigned char *)RSTRING_PTR(str
);
289 if (q
[0] == 0xFE && q
[1] == 0xFF) {
290 return rb_enc_get_from_index(ENCINDEX_UTF_16BE
);
292 if (q
[0] == 0xFF && q
[1] == 0xFE) {
293 return rb_enc_get_from_index(ENCINDEX_UTF_16LE
);
295 return rb_ascii8bit_encoding();
296 case ENCINDEX_UTF_32
:
297 if (RSTRING_LEN(str
) < 4) break;
298 q
= (const unsigned char *)RSTRING_PTR(str
);
299 if (q
[0] == 0 && q
[1] == 0 && q
[2] == 0xFE && q
[3] == 0xFF) {
300 return rb_enc_get_from_index(ENCINDEX_UTF_32BE
);
302 if (q
[3] == 0 && q
[2] == 0 && q
[1] == 0xFE && q
[0] == 0xFF) {
303 return rb_enc_get_from_index(ENCINDEX_UTF_32LE
);
305 return rb_ascii8bit_encoding();
307 return rb_enc_from_index(encidx
);
311 get_encoding(VALUE str
)
313 return get_actual_encoding(ENCODING_GET(str
), str
);
317 mustnot_broken(VALUE str
)
319 if (is_broken_string(str
)) {
320 rb_raise(rb_eArgError
, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str
)));
325 mustnot_wchar(VALUE str
)
327 rb_encoding
*enc
= STR_ENC_GET(str
);
328 if (rb_enc_mbminlen(enc
) > 1) {
329 rb_raise(rb_eArgError
, "wide char encoding: %s", rb_enc_name(enc
));
333 static int fstring_cmp(VALUE a
, VALUE b
);
335 static VALUE
register_fstring(VALUE str
, bool copy
);
337 const struct st_hash_type rb_fstring_hash_type
= {
/* True when str does not have the FL_EXIVAR flag set and its class is
 * exactly rb_cString (not a subclass).  `str` is evaluated twice. */
342 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
344 struct fstr_update_arg
{
350 fstr_update_callback(st_data_t
*key
, st_data_t
*value
, st_data_t data
, int existing
)
353 struct fstr_update_arg
*arg
= (struct fstr_update_arg
*)data
;
354 VALUE str
= (VALUE
)*key
;
357 /* because of lazy sweep, str may be unmarked already and swept
360 if (rb_objspace_garbage_object_p(str
)) {
369 if (FL_TEST_RAW(str
, STR_FAKESTR
)) {
371 VALUE new_str
= str_new(rb_cString
, RSTRING(str
)->as
.heap
.ptr
, RSTRING(str
)->as
.heap
.len
);
372 rb_enc_copy(new_str
, str
);
376 str
= str_new_static(rb_cString
, RSTRING(str
)->as
.heap
.ptr
,
377 RSTRING(str
)->as
.heap
.len
,
383 if (!OBJ_FROZEN(str
))
384 str
= str_new_frozen(rb_cString
, str
);
385 if (STR_SHARED_P(str
)) { /* str should not be shared */
386 /* shared substring */
387 str_make_independent(str
);
388 assert(OBJ_FROZEN(str
));
390 if (!BARE_STRING_P(str
)) {
391 str
= str_new_frozen(rb_cString
, str
);
394 RBASIC(str
)->flags
|= RSTRING_FSTR
;
396 *key
= *value
= arg
->fstr
= str
;
403 rb_fstring(VALUE str
)
408 Check_Type(str
, T_STRING
);
410 if (FL_TEST(str
, RSTRING_FSTR
))
413 bare
= BARE_STRING_P(str
);
415 if (STR_EMBED_P(str
)) {
419 if (FL_TEST_RAW(str
, STR_NOEMBED
|STR_SHARED_ROOT
|STR_SHARED
) == (STR_NOEMBED
|STR_SHARED_ROOT
)) {
420 assert(OBJ_FROZEN(str
));
425 if (!OBJ_FROZEN(str
))
426 rb_str_resize(str
, RSTRING_LEN(str
));
428 fstr
= register_fstring(str
, FALSE
);
431 str_replace_shared_without_enc(str
, fstr
);
439 register_fstring(VALUE str
, bool copy
)
441 struct fstr_update_arg args
;
446 st_table
*frozen_strings
= rb_vm_fstring_table();
449 st_update(frozen_strings
, (st_data_t
)str
, fstr_update_callback
, (st_data_t
)&args
);
450 } while (args
.fstr
== Qundef
);
454 assert(OBJ_FROZEN(args
.fstr
));
455 assert(!FL_TEST_RAW(args
.fstr
, STR_FAKESTR
));
456 assert(!FL_TEST_RAW(args
.fstr
, FL_EXIVAR
));
457 assert(RBASIC_CLASS(args
.fstr
) == rb_cString
);
462 setup_fake_str(struct RString
*fake_str
, const char *name
, long len
, int encidx
)
464 fake_str
->basic
.flags
= T_STRING
|RSTRING_NOEMBED
|STR_NOFREE
|STR_FAKESTR
;
465 /* SHARED to be allocated by the callback */
468 RUBY_ASSERT_ALWAYS(len
== 0);
472 ENCODING_SET_INLINED((VALUE
)fake_str
, encidx
);
474 RBASIC_SET_CLASS_RAW((VALUE
)fake_str
, rb_cString
);
475 fake_str
->as
.heap
.len
= len
;
476 fake_str
->as
.heap
.ptr
= (char *)name
;
477 fake_str
->as
.heap
.aux
.capa
= len
;
478 return (VALUE
)fake_str
;
482 * set up a fake string which refers to a static string literal.
485 rb_setup_fake_str(struct RString
*fake_str
, const char *name
, long len
, rb_encoding
*enc
)
487 return setup_fake_str(fake_str
, name
, len
, rb_enc_to_index(enc
));
491 * rb_fstring_new and rb_fstring_cstr family create or look up a frozen
492 * shared string which refers to a static string literal. `ptr` must
493 * point to a constant string.
495 MJIT_FUNC_EXPORTED VALUE
496 rb_fstring_new(const char *ptr
, long len
)
498 struct RString fake_str
;
499 return register_fstring(setup_fake_str(&fake_str
, ptr
, len
, ENCINDEX_US_ASCII
), FALSE
);
503 rb_fstring_enc_new(const char *ptr
, long len
, rb_encoding
*enc
)
505 struct RString fake_str
;
506 return register_fstring(rb_setup_fake_str(&fake_str
, ptr
, len
, enc
), FALSE
);
510 rb_fstring_cstr(const char *ptr
)
512 return rb_fstring_new(ptr
, strlen(ptr
));
516 fstring_set_class_i(st_data_t key
, st_data_t val
, st_data_t arg
)
518 RBASIC_SET_CLASS((VALUE
)key
, (VALUE
)arg
);
523 fstring_cmp(VALUE a
, VALUE b
)
526 const char *aptr
, *bptr
;
527 RSTRING_GETMEM(a
, aptr
, alen
);
528 RSTRING_GETMEM(b
, bptr
, blen
);
529 return (alen
!= blen
||
530 ENCODING_GET(a
) != ENCODING_GET(b
) ||
531 memcmp(aptr
, bptr
, alen
) != 0);
535 single_byte_optimizable(VALUE str
)
539 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
540 if (ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
)
543 enc
= STR_ENC_GET(str
);
544 if (rb_enc_mbmaxlen(enc
) == 1)
547 /* Conservative. Possibly single byte.
548 * "\xa1" in Shift_JIS for example. */
554 static inline const char *
555 search_nonascii(const char *p
, const char *e
)
557 const uintptr_t *s
, *t
;
559 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
560 # if SIZEOF_UINTPTR_T == 8
561 # define NONASCII_MASK UINT64_C(0x8080808080808080)
562 # elif SIZEOF_UINTPTR_T == 4
563 # define NONASCII_MASK UINT32_C(0x80808080)
565 # error "don't know what to do."
568 # if SIZEOF_UINTPTR_T == 8
569 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
570 # elif SIZEOF_UINTPTR_T == 4
571 # define NONASCII_MASK 0x80808080UL /* or...? */
573 # error "don't know what to do."
577 if (UNALIGNED_WORD_ACCESS
|| e
- p
>= SIZEOF_VOIDP
) {
578 #if !UNALIGNED_WORD_ACCESS
579 if ((uintptr_t)p
% SIZEOF_VOIDP
) {
580 int l
= SIZEOF_VOIDP
- (uintptr_t)p
% SIZEOF_VOIDP
;
583 default: UNREACHABLE
;
585 case 7: if (p
[-7]&0x80) return p
-7;
586 case 6: if (p
[-6]&0x80) return p
-6;
587 case 5: if (p
[-5]&0x80) return p
-5;
588 case 4: if (p
[-4]&0x80) return p
-4;
590 case 3: if (p
[-3]&0x80) return p
-3;
591 case 2: if (p
[-2]&0x80) return p
-2;
592 case 1: if (p
[-1]&0x80) return p
-1;
597 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
598 #define aligned_ptr(value) \
599 __builtin_assume_aligned((value), sizeof(uintptr_t))
601 #define aligned_ptr(value) (uintptr_t *)(value)
604 t
= (uintptr_t *)(e
- (SIZEOF_VOIDP
-1));
607 if (*s
& NONASCII_MASK
) {
608 #ifdef WORDS_BIGENDIAN
609 return (const char *)s
+ (nlz_intptr(*s
&NONASCII_MASK
)>>3);
611 return (const char *)s
+ (ntz_intptr(*s
&NONASCII_MASK
)>>3);
619 default: UNREACHABLE
;
621 case 7: if (e
[-7]&0x80) return e
-7;
622 case 6: if (e
[-6]&0x80) return e
-6;
623 case 5: if (e
[-5]&0x80) return e
-5;
624 case 4: if (e
[-4]&0x80) return e
-4;
626 case 3: if (e
[-3]&0x80) return e
-3;
627 case 2: if (e
[-2]&0x80) return e
-2;
628 case 1: if (e
[-1]&0x80) return e
-1;
634 coderange_scan(const char *p
, long len
, rb_encoding
*enc
)
636 const char *e
= p
+ len
;
638 if (rb_enc_to_index(enc
) == rb_ascii8bit_encindex()) {
639 /* enc is ASCII-8BIT. An ASCII-8BIT string is never broken. */
640 p
= search_nonascii(p
, e
);
641 return p
? ENC_CODERANGE_VALID
: ENC_CODERANGE_7BIT
;
644 if (rb_enc_asciicompat(enc
)) {
645 p
= search_nonascii(p
, e
);
646 if (!p
) return ENC_CODERANGE_7BIT
;
648 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
649 if (!MBCLEN_CHARFOUND_P(ret
)) return ENC_CODERANGE_BROKEN
;
650 p
+= MBCLEN_CHARFOUND_LEN(ret
);
652 p
= search_nonascii(p
, e
);
658 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
659 if (!MBCLEN_CHARFOUND_P(ret
)) return ENC_CODERANGE_BROKEN
;
660 p
+= MBCLEN_CHARFOUND_LEN(ret
);
663 return ENC_CODERANGE_VALID
;
667 rb_str_coderange_scan_restartable(const char *s
, const char *e
, rb_encoding
*enc
, int *cr
)
671 if (*cr
== ENC_CODERANGE_BROKEN
)
674 if (rb_enc_to_index(enc
) == rb_ascii8bit_encindex()) {
675 /* enc is ASCII-8BIT. An ASCII-8BIT string is never broken. */
676 if (*cr
== ENC_CODERANGE_VALID
) return e
- s
;
677 p
= search_nonascii(p
, e
);
678 *cr
= p
? ENC_CODERANGE_VALID
: ENC_CODERANGE_7BIT
;
681 else if (rb_enc_asciicompat(enc
)) {
682 p
= search_nonascii(p
, e
);
684 if (*cr
!= ENC_CODERANGE_VALID
) *cr
= ENC_CODERANGE_7BIT
;
688 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
689 if (!MBCLEN_CHARFOUND_P(ret
)) {
690 *cr
= MBCLEN_INVALID_P(ret
) ? ENC_CODERANGE_BROKEN
: ENC_CODERANGE_UNKNOWN
;
693 p
+= MBCLEN_CHARFOUND_LEN(ret
);
695 p
= search_nonascii(p
, e
);
701 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
702 if (!MBCLEN_CHARFOUND_P(ret
)) {
703 *cr
= MBCLEN_INVALID_P(ret
) ? ENC_CODERANGE_BROKEN
: ENC_CODERANGE_UNKNOWN
;
706 p
+= MBCLEN_CHARFOUND_LEN(ret
);
709 *cr
= ENC_CODERANGE_VALID
;
714 str_enc_copy(VALUE str1
, VALUE str2
)
716 rb_enc_set_index(str1
, ENCODING_GET(str2
));
720 rb_enc_cr_str_copy_for_substr(VALUE dest
, VALUE src
)
722 /* this function is designed for copying the encoding and coderange
723 * from src to a new string "dest" which is made from a part of src.
725 str_enc_copy(dest
, src
);
726 if (RSTRING_LEN(dest
) == 0) {
727 if (!rb_enc_asciicompat(STR_ENC_GET(src
)))
728 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_VALID
);
730 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
733 switch (ENC_CODERANGE(src
)) {
734 case ENC_CODERANGE_7BIT
:
735 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
737 case ENC_CODERANGE_VALID
:
738 if (!rb_enc_asciicompat(STR_ENC_GET(src
)) ||
739 search_nonascii(RSTRING_PTR(dest
), RSTRING_END(dest
)))
740 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_VALID
);
742 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
750 rb_enc_cr_str_exact_copy(VALUE dest
, VALUE src
)
752 str_enc_copy(dest
, src
);
753 ENC_CODERANGE_SET(dest
, ENC_CODERANGE(src
));
757 enc_coderange_scan(VALUE str
, rb_encoding
*enc
, int encidx
)
759 if (rb_enc_mbminlen(enc
) > 1 && rb_enc_dummy_p(enc
) &&
760 rb_enc_mbminlen(enc
= get_actual_encoding(encidx
, str
)) == 1) {
761 return ENC_CODERANGE_BROKEN
;
764 return coderange_scan(RSTRING_PTR(str
), RSTRING_LEN(str
), enc
);
769 rb_enc_str_coderange_scan(VALUE str
, rb_encoding
*enc
)
771 return enc_coderange_scan(str
, enc
, rb_enc_to_index(enc
));
775 rb_enc_str_coderange(VALUE str
)
777 int cr
= ENC_CODERANGE(str
);
779 if (cr
== ENC_CODERANGE_UNKNOWN
) {
780 int encidx
= ENCODING_GET(str
);
781 rb_encoding
*enc
= rb_enc_from_index(encidx
);
782 cr
= enc_coderange_scan(str
, enc
, encidx
);
783 ENC_CODERANGE_SET(str
, cr
);
789 rb_enc_str_asciionly_p(VALUE str
)
791 rb_encoding
*enc
= STR_ENC_GET(str
);
793 if (!rb_enc_asciicompat(enc
))
795 else if (rb_enc_str_coderange(str
) == ENC_CODERANGE_7BIT
)
801 str_mod_check(VALUE s
, const char *p
, long len
)
803 if (RSTRING_PTR(s
) != p
|| RSTRING_LEN(s
) != len
){
804 rb_raise(rb_eRuntimeError
, "string modified");
809 str_capacity(VALUE str
, const int termlen
)
811 if (STR_EMBED_P(str
)) {
813 return str_embed_capa(str
) - termlen
;
815 return (RSTRING_EMBED_LEN_MAX
+ 1 - termlen
);
818 else if (FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
819 return RSTRING(str
)->as
.heap
.len
;
822 return RSTRING(str
)->as
.heap
.aux
.capa
;
827 rb_str_capacity(VALUE str
)
829 return str_capacity(str
, TERM_LEN(str
));
833 must_not_null(const char *ptr
)
836 rb_raise(rb_eArgError
, "NULL pointer given");
841 str_alloc(VALUE klass
, size_t size
)
844 RVARGC_NEWOBJ_OF(str
, struct RString
, klass
,
845 T_STRING
| (RGENGC_WB_PROTECTED_STRING
? FL_WB_PROTECTED
: 0), size
);
850 str_alloc_embed(VALUE klass
, size_t capa
)
852 size_t size
= str_embed_size(capa
);
853 assert(rb_gc_size_allocatable_p(size
));
855 assert(size
<= sizeof(struct RString
));
857 return str_alloc(klass
, size
);
861 str_alloc_heap(VALUE klass
)
863 return str_alloc(klass
, sizeof(struct RString
));
867 empty_str_alloc(VALUE klass
)
869 RUBY_DTRACE_CREATE_HOOK(STRING
, 0);
870 VALUE str
= str_alloc_embed(klass
, 0);
871 memset(RSTRING(str
)->as
.embed
.ary
, 0, str_embed_capa(str
));
876 str_new0(VALUE klass
, const char *ptr
, long len
, int termlen
)
881 rb_raise(rb_eArgError
, "negative string size (or size too big)");
884 RUBY_DTRACE_CREATE_HOOK(STRING
, len
);
886 if (STR_EMBEDDABLE_P(len
, termlen
)) {
887 str
= str_alloc_embed(klass
, len
+ termlen
);
889 ENC_CODERANGE_SET(str
, ENC_CODERANGE_7BIT
);
893 str
= str_alloc_heap(klass
);
894 RSTRING(str
)->as
.heap
.aux
.capa
= len
;
895 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
896 * integer overflow. If we can STATIC_ASSERT that, the following
897 * mul_add_mul can be reverted to a simple ALLOC_N. */
898 RSTRING(str
)->as
.heap
.ptr
=
899 rb_xmalloc_mul_add_mul(sizeof(char), len
, sizeof(char), termlen
);
900 STR_SET_NOEMBED(str
);
903 memcpy(RSTRING_PTR(str
), ptr
, len
);
905 STR_SET_LEN(str
, len
);
906 TERM_FILL(RSTRING_PTR(str
) + len
, termlen
);
911 str_new(VALUE klass
, const char *ptr
, long len
)
913 return str_new0(klass
, ptr
, len
, 1);
917 rb_str_new(const char *ptr
, long len
)
919 return str_new(rb_cString
, ptr
, len
);
923 rb_usascii_str_new(const char *ptr
, long len
)
925 VALUE str
= rb_str_new(ptr
, len
);
926 ENCODING_CODERANGE_SET(str
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
931 rb_utf8_str_new(const char *ptr
, long len
)
933 VALUE str
= str_new(rb_cString
, ptr
, len
);
934 rb_enc_associate_index(str
, rb_utf8_encindex());
939 rb_enc_str_new(const char *ptr
, long len
, rb_encoding
*enc
)
943 if (!enc
) return rb_str_new(ptr
, len
);
945 str
= str_new0(rb_cString
, ptr
, len
, rb_enc_mbminlen(enc
));
946 rb_enc_associate(str
, enc
);
951 rb_str_new_cstr(const char *ptr
)
954 /* rb_str_new_cstr() can take pointer from non-malloc-generated
955 * memory regions, and that cannot be detected by the MSAN. Just
956 * trust the programmer that the argument passed here is a sane C
958 __msan_unpoison_string(ptr
);
959 return rb_str_new(ptr
, strlen(ptr
));
963 rb_usascii_str_new_cstr(const char *ptr
)
965 VALUE str
= rb_str_new_cstr(ptr
);
966 ENCODING_CODERANGE_SET(str
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
971 rb_utf8_str_new_cstr(const char *ptr
)
973 VALUE str
= rb_str_new_cstr(ptr
);
974 rb_enc_associate_index(str
, rb_utf8_encindex());
979 rb_enc_str_new_cstr(const char *ptr
, rb_encoding
*enc
)
982 if (rb_enc_mbminlen(enc
) != 1) {
983 rb_raise(rb_eArgError
, "wchar encoding given");
985 return rb_enc_str_new(ptr
, strlen(ptr
), enc
);
989 str_new_static(VALUE klass
, const char *ptr
, long len
, int encindex
)
994 rb_raise(rb_eArgError
, "negative string size (or size too big)");
998 rb_encoding
*enc
= rb_enc_get_from_index(encindex
);
999 str
= str_new0(klass
, ptr
, len
, rb_enc_mbminlen(enc
));
1002 RUBY_DTRACE_CREATE_HOOK(STRING
, len
);
1003 str
= str_alloc_heap(klass
);
1004 RSTRING(str
)->as
.heap
.len
= len
;
1005 RSTRING(str
)->as
.heap
.ptr
= (char *)ptr
;
1006 RSTRING(str
)->as
.heap
.aux
.capa
= len
;
1007 STR_SET_NOEMBED(str
);
1008 RBASIC(str
)->flags
|= STR_NOFREE
;
1010 rb_enc_associate_index(str
, encindex
);
1015 rb_str_new_static(const char *ptr
, long len
)
1017 return str_new_static(rb_cString
, ptr
, len
, 0);
1021 rb_usascii_str_new_static(const char *ptr
, long len
)
1023 return str_new_static(rb_cString
, ptr
, len
, ENCINDEX_US_ASCII
);
1027 rb_utf8_str_new_static(const char *ptr
, long len
)
1029 return str_new_static(rb_cString
, ptr
, len
, ENCINDEX_UTF_8
);
1033 rb_enc_str_new_static(const char *ptr
, long len
, rb_encoding
*enc
)
1035 return str_new_static(rb_cString
, ptr
, len
, rb_enc_to_index(enc
));
1038 static VALUE
str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1039 rb_encoding
*from
, rb_encoding
*to
,
1040 int ecflags
, VALUE ecopts
);
1043 is_enc_ascii_string(VALUE str
, rb_encoding
*enc
)
1045 int encidx
= rb_enc_to_index(enc
);
1046 if (rb_enc_get_index(str
) == encidx
)
1047 return is_ascii_string(str
);
1048 return enc_coderange_scan(str
, enc
, encidx
) == ENC_CODERANGE_7BIT
;
1052 rb_str_conv_enc_opts(VALUE str
, rb_encoding
*from
, rb_encoding
*to
, int ecflags
, VALUE ecopts
)
1058 if (!to
) return str
;
1059 if (!from
) from
= rb_enc_get(str
);
1060 if (from
== to
) return str
;
1061 if ((rb_enc_asciicompat(to
) && is_enc_ascii_string(str
, from
)) ||
1062 to
== rb_ascii8bit_encoding()) {
1063 if (STR_ENC_GET(str
) != to
) {
1064 str
= rb_str_dup(str
);
1065 rb_enc_associate(str
, to
);
1070 RSTRING_GETMEM(str
, ptr
, len
);
1071 newstr
= str_cat_conv_enc_opts(rb_str_buf_new(len
), 0, ptr
, len
,
1072 from
, to
, ecflags
, ecopts
);
1073 if (NIL_P(newstr
)) {
1074 /* some error, return original */
1081 rb_str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1082 rb_encoding
*from
, int ecflags
, VALUE ecopts
)
1086 olen
= RSTRING_LEN(newstr
);
1087 if (ofs
< -olen
|| olen
< ofs
)
1088 rb_raise(rb_eIndexError
, "index %ld out of string", ofs
);
1089 if (ofs
< 0) ofs
+= olen
;
1091 STR_SET_LEN(newstr
, ofs
);
1092 return rb_str_cat(newstr
, ptr
, len
);
1095 rb_str_modify(newstr
);
1096 return str_cat_conv_enc_opts(newstr
, ofs
, ptr
, len
, from
,
1102 rb_str_initialize(VALUE str
, const char *ptr
, long len
, rb_encoding
*enc
)
1104 STR_SET_LEN(str
, 0);
1105 rb_enc_associate(str
, enc
);
1106 rb_str_cat(str
, ptr
, len
);
1111 str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1112 rb_encoding
*from
, rb_encoding
*to
,
1113 int ecflags
, VALUE ecopts
)
1116 rb_econv_result_t ret
;
1118 VALUE econv_wrapper
;
1119 const unsigned char *start
, *sp
;
1120 unsigned char *dest
, *dp
;
1121 size_t converted_output
= (size_t)ofs
;
1123 olen
= rb_str_capacity(newstr
);
1125 econv_wrapper
= rb_obj_alloc(rb_cEncodingConverter
);
1126 RBASIC_CLEAR_CLASS(econv_wrapper
);
1127 ec
= rb_econv_open_opts(from
->name
, to
->name
, ecflags
, ecopts
);
1128 if (!ec
) return Qnil
;
1129 DATA_PTR(econv_wrapper
) = ec
;
1131 sp
= (unsigned char*)ptr
;
1133 while ((dest
= (unsigned char*)RSTRING_PTR(newstr
)),
1134 (dp
= dest
+ converted_output
),
1135 (ret
= rb_econv_convert(ec
, &sp
, start
+ len
, &dp
, dest
+ olen
, 0)),
1136 ret
== econv_destination_buffer_full
) {
1137 /* destination buffer short */
1138 size_t converted_input
= sp
- start
;
1139 size_t rest
= len
- converted_input
;
1140 converted_output
= dp
- dest
;
1141 rb_str_set_len(newstr
, converted_output
);
1142 if (converted_input
&& converted_output
&&
1143 rest
< (LONG_MAX
/ converted_output
)) {
1144 rest
= (rest
* converted_output
) / converted_input
;
1149 olen
+= rest
< 2 ? 2 : rest
;
1150 rb_str_resize(newstr
, olen
);
1152 DATA_PTR(econv_wrapper
) = 0;
1155 case econv_finished
:
1156 len
= dp
- (unsigned char*)RSTRING_PTR(newstr
);
1157 rb_str_set_len(newstr
, len
);
1158 rb_enc_associate(newstr
, to
);
1167 rb_str_conv_enc(VALUE str
, rb_encoding
*from
, rb_encoding
*to
)
1169 return rb_str_conv_enc_opts(str
, from
, to
, 0, Qnil
);
1173 rb_external_str_new_with_enc(const char *ptr
, long len
, rb_encoding
*eenc
)
1177 const int eidx
= rb_enc_to_index(eenc
);
1180 return rb_enc_str_new(ptr
, len
, eenc
);
1183 /* ASCII-8BIT case, no conversion */
1184 if ((eidx
== rb_ascii8bit_encindex()) ||
1185 (eidx
== rb_usascii_encindex() && search_nonascii(ptr
, ptr
+ len
))) {
1186 return rb_str_new(ptr
, len
);
1188 /* no default_internal or same encoding, no conversion */
1189 ienc
= rb_default_internal_encoding();
1190 if (!ienc
|| eenc
== ienc
) {
1191 return rb_enc_str_new(ptr
, len
, eenc
);
1193 /* ASCII compatible, and ASCII only string, no conversion in
1194 * default_internal */
1195 if ((eidx
== rb_ascii8bit_encindex()) ||
1196 (eidx
== rb_usascii_encindex()) ||
1197 (rb_enc_asciicompat(eenc
) && !search_nonascii(ptr
, ptr
+ len
))) {
1198 return rb_enc_str_new(ptr
, len
, ienc
);
1200 /* convert from the given encoding to default_internal */
1201 str
= rb_enc_str_new(NULL
, 0, ienc
);
1202 /* when the conversion fails for some reason, just ignore the
1203 * default_internal and fall back to the given encoding as-is. */
1204 if (NIL_P(rb_str_cat_conv_enc_opts(str
, 0, ptr
, len
, eenc
, 0, Qnil
))) {
1205 rb_str_initialize(str
, ptr
, len
, eenc
);
1211 rb_external_str_with_enc(VALUE str
, rb_encoding
*eenc
)
1213 int eidx
= rb_enc_to_index(eenc
);
1214 if (eidx
== rb_usascii_encindex() &&
1215 rb_enc_str_coderange(str
) != ENC_CODERANGE_7BIT
) {
1216 rb_enc_associate_index(str
, rb_ascii8bit_encindex());
1219 rb_enc_associate_index(str
, eidx
);
1220 return rb_str_conv_enc(str
, eenc
, rb_default_internal_encoding());
1224 rb_external_str_new(const char *ptr
, long len
)
1226 return rb_external_str_new_with_enc(ptr
, len
, rb_default_external_encoding());
1230 rb_external_str_new_cstr(const char *ptr
)
1232 return rb_external_str_new_with_enc(ptr
, strlen(ptr
), rb_default_external_encoding());
1236 rb_locale_str_new(const char *ptr
, long len
)
1238 return rb_external_str_new_with_enc(ptr
, len
, rb_locale_encoding());
1242 rb_locale_str_new_cstr(const char *ptr
)
1244 return rb_external_str_new_with_enc(ptr
, strlen(ptr
), rb_locale_encoding());
1248 rb_filesystem_str_new(const char *ptr
, long len
)
1250 return rb_external_str_new_with_enc(ptr
, len
, rb_filesystem_encoding());
1254 rb_filesystem_str_new_cstr(const char *ptr
)
1256 return rb_external_str_new_with_enc(ptr
, strlen(ptr
), rb_filesystem_encoding());
1260 rb_str_export(VALUE str
)
1262 return rb_str_export_to_enc(str
, rb_default_external_encoding());
1266 rb_str_export_locale(VALUE str
)
1268 return rb_str_export_to_enc(str
, rb_locale_encoding());
1272 rb_str_export_to_enc(VALUE str
, rb_encoding
*enc
)
1274 return rb_str_conv_enc(str
, STR_ENC_GET(str
), enc
);
1278 str_replace_shared_without_enc(VALUE str2
, VALUE str
)
1280 const int termlen
= TERM_LEN(str
);
1284 RSTRING_GETMEM(str
, ptr
, len
);
1285 if (str_embed_capa(str2
) >= len
+ termlen
) {
1286 char *ptr2
= RSTRING(str2
)->as
.embed
.ary
;
1287 STR_SET_EMBED(str2
);
1288 memcpy(ptr2
, RSTRING_PTR(str
), len
);
1289 STR_SET_EMBED_LEN(str2
, len
);
1290 TERM_FILL(ptr2
+len
, termlen
);
1294 if (STR_SHARED_P(str
)) {
1295 root
= RSTRING(str
)->as
.heap
.aux
.shared
;
1296 RSTRING_GETMEM(str
, ptr
, len
);
1299 root
= rb_str_new_frozen(str
);
1300 RSTRING_GETMEM(root
, ptr
, len
);
1302 assert(OBJ_FROZEN(root
));
1303 if (!STR_EMBED_P(str2
) && !FL_TEST_RAW(str2
, STR_SHARED
|STR_NOFREE
)) {
1304 if (FL_TEST_RAW(str2
, STR_SHARED_ROOT
)) {
1305 rb_fatal("about to free a possible shared root");
1307 char *ptr2
= STR_HEAP_PTR(str2
);
1309 ruby_sized_xfree(ptr2
, STR_HEAP_SIZE(str2
));
1312 FL_SET(str2
, STR_NOEMBED
);
1313 RSTRING(str2
)->as
.heap
.len
= len
;
1314 RSTRING(str2
)->as
.heap
.ptr
= ptr
;
1315 STR_SET_SHARED(str2
, root
);
1321 str_replace_shared(VALUE str2
, VALUE str
)
1323 str_replace_shared_without_enc(str2
, str
);
1324 rb_enc_cr_str_exact_copy(str2
, str
);
1329 str_new_shared(VALUE klass
, VALUE str
)
1331 return str_replace_shared(str_alloc_heap(klass
), str
);
1335 rb_str_new_shared(VALUE str
)
1337 return str_new_shared(rb_obj_class(str
), str
);
1341 rb_str_new_frozen(VALUE orig
)
1343 if (OBJ_FROZEN(orig
)) return orig
;
1344 return str_new_frozen(rb_obj_class(orig
), orig
);
1348 rb_str_new_frozen_String(VALUE orig
)
1350 if (OBJ_FROZEN(orig
) && rb_obj_class(orig
) == rb_cString
) return orig
;
1351 return str_new_frozen(rb_cString
, orig
);
1355 rb_str_tmp_frozen_acquire(VALUE orig
)
1357 if (OBJ_FROZEN_RAW(orig
)) return orig
;
1358 return str_new_frozen_buffer(0, orig
, FALSE
);
/* Releases tmp, a string previously obtained for orig.
 *
 * Visible logic:
 *  - A check on RBASIC_CLASS(tmp) != 0 comes first.
 *    NOTE(review): its body is not visible here; presumably an early return
 *    for strings that have a class -- confirm.
 *  - If tmp is embedded, only its frozen state is asserted.
 *  - Otherwise, when orig is STR_SHARED and carries neither STR_TMPLOCK nor
 *    RUBY_FL_FREEZE, and orig's shared root is tmp, and tmp is not
 *    STR_BORROWED: orig is un-shared (STR_SHARED cleared), takes over tmp's
 *    capa and STR_NOFREE bit, and tmp is reset to an empty embedded string.
 */
1362 rb_str_tmp_frozen_release(VALUE orig
, VALUE tmp
)
1364 if (RBASIC_CLASS(tmp
) != 0)
1367 if (STR_EMBED_P(tmp
)) {
1368 assert(OBJ_FROZEN_RAW(tmp
));
1370 else if (FL_TEST_RAW(orig
, STR_SHARED
) &&
1371 !FL_TEST_RAW(orig
, STR_TMPLOCK
|RUBY_FL_FREEZE
)) {
1372 VALUE shared
= RSTRING(orig
)->as
.heap
.aux
.shared
;
1374 if (shared
== tmp
&& !FL_TEST_RAW(tmp
, STR_BORROWED
)) {
1375 assert(RSTRING(orig
)->as
.heap
.ptr
== RSTRING(tmp
)->as
.heap
.ptr
);
1376 assert(RSTRING(orig
)->as
.heap
.len
== RSTRING(tmp
)->as
.heap
.len
);
1378 /* Unshare orig since the root (tmp) only has this one child. */
1379 FL_UNSET_RAW(orig
, STR_SHARED
);
1380 RSTRING(orig
)->as
.heap
.aux
.capa
= RSTRING(tmp
)->as
.heap
.aux
.capa
;
1381 RBASIC(orig
)->flags
|= RBASIC(tmp
)->flags
& STR_NOFREE
;
1382 assert(OBJ_FROZEN_RAW(tmp
));
1384 /* Make tmp embedded and empty so it is safe for sweeping. */
1386 STR_SET_EMBED_LEN(tmp
, 0);
/* Wrapper: str_new_frozen_buffer(klass, orig, TRUE), i.e. with
 * copy_encoding enabled. */
1392 str_new_frozen(VALUE klass
, VALUE orig
)
1394 return str_new_frozen_buffer(klass
, orig
, TRUE
);
/* Creates a new heap string `str` of class klass that takes over orig's
 * buffer, and turns orig into a shared string pointing at it.
 *
 * Preconditions (asserted): orig is neither embedded nor already shared.
 * Steps visible below: str gets orig's len, ptr and capa; the STR_NOFREE
 * bit is moved from orig to str; STR_SET_SHARED(orig, str) links orig to
 * the new root; STR_BORROWED is cleared on str.
 * NOTE(review): the return statement is not visible in this listing.
 */
1398 heap_str_make_shared(VALUE klass
, VALUE orig
)
1400 assert(!STR_EMBED_P(orig
));
1401 assert(!STR_SHARED_P(orig
));
1403 VALUE str
= str_alloc_heap(klass
);
1404 STR_SET_NOEMBED(str
);
1405 RSTRING(str
)->as
.heap
.len
= RSTRING_LEN(orig
);
1406 RSTRING(str
)->as
.heap
.ptr
= RSTRING_PTR(orig
);
1407 RSTRING(str
)->as
.heap
.aux
.capa
= RSTRING(orig
)->as
.heap
.aux
.capa
;
1408 RBASIC(str
)->flags
|= RBASIC(orig
)->flags
& STR_NOFREE
;
1409 RBASIC(orig
)->flags
&= ~STR_NOFREE
;
1410 STR_SET_SHARED(orig
, str
);
1412 FL_UNSET_RAW(str
, STR_BORROWED
);
/* Builds a string of class klass from orig; used by str_new_frozen()
 * (copy_encoding TRUE) and rb_str_tmp_frozen_acquire() (klass 0,
 * copy_encoding FALSE).
 *
 * termlen is TERM_LEN(orig) when copy_encoding is set, otherwise 1.
 * Branches visible below:
 *  - orig embedded, or len/termlen embeddable: copy the bytes with
 *    str_new0() (result asserted to be embedded).
 *  - orig is STR_SHARED: compute ofs/rest of orig inside its shared root.
 *    If orig is a strict sub-range, or klass or encoding differ from the
 *    root, create a new shared string on the root and narrow its ptr/len.
 *    A root with class 0 gets STR_BORROWED set.
 *  - orig embeddable by RSTRING_LEN + TERM_LEN: copy into a fresh embedded
 *    string and terminate it.
 *  - otherwise: heap_str_make_shared(klass, orig).
 * Finally, encoding and code range are copied when copy_encoding is set.
 * NOTE(review): braces, else-lines and the return are missing from this
 * listing, so the exact nesting of the branches must be checked against
 * the full file.
 */
1417 str_new_frozen_buffer(VALUE klass
, VALUE orig
, int copy_encoding
)
1421 long len
= RSTRING_LEN(orig
);
1422 int termlen
= copy_encoding
? TERM_LEN(orig
) : 1;
1424 if (STR_EMBED_P(orig
) || STR_EMBEDDABLE_P(len
, termlen
)) {
1425 str
= str_new0(klass
, RSTRING_PTR(orig
), len
, termlen
);
1426 assert(STR_EMBED_P(str
));
1429 if (FL_TEST_RAW(orig
, STR_SHARED
)) {
1430 VALUE shared
= RSTRING(orig
)->as
.heap
.aux
.shared
;
1431 long ofs
= RSTRING(orig
)->as
.heap
.ptr
- RSTRING_PTR(shared
);
1432 long rest
= RSTRING_LEN(shared
) - ofs
- RSTRING(orig
)->as
.heap
.len
;
1435 assert(ofs
+ rest
<= RSTRING_LEN(shared
));
1437 assert(!STR_EMBED_P(shared
));
1439 assert(OBJ_FROZEN(shared
));
1441 if ((ofs
> 0) || (rest
> 0) ||
1442 (klass
!= RBASIC(shared
)->klass
) ||
1443 ENCODING_GET(shared
) != ENCODING_GET(orig
)) {
1444 str
= str_new_shared(klass
, shared
);
1445 assert(!STR_EMBED_P(str
));
1446 RSTRING(str
)->as
.heap
.ptr
+= ofs
;
1447 RSTRING(str
)->as
.heap
.len
-= ofs
+ rest
;
1450 if (RBASIC_CLASS(shared
) == 0)
1451 FL_SET_RAW(shared
, STR_BORROWED
);
1455 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig
), TERM_LEN(orig
))) {
1456 str
= str_alloc_embed(klass
, RSTRING_LEN(orig
) + TERM_LEN(orig
));
1458 memcpy(RSTRING_PTR(str
), RSTRING_PTR(orig
), RSTRING_LEN(orig
));
1459 STR_SET_EMBED_LEN(str
, RSTRING_LEN(orig
));
1460 TERM_FILL(RSTRING_END(str
), TERM_LEN(orig
));
1463 str
= heap_str_make_shared(klass
, orig
);
1467 if (copy_encoding
) rb_enc_cr_str_exact_copy(str
, orig
);
/* Creates a string from ptr/len via str_new0(), using obj only as a
 * template: the class is rb_obj_class(obj) and the terminator length is
 * TERM_LEN(obj). */
1473 rb_str_new_with_class(VALUE obj
, const char *ptr
, long len
)
1475 return str_new0(rb_obj_class(obj
), ptr
, len
, TERM_LEN(obj
));
/* Creates a string with rb_str_new(0, 0) and copies str's encoding onto it
 * with rb_enc_copy().
 * NOTE(review): the return statement is not visible in this listing. */
1479 str_new_empty_String(VALUE str
)
1481 VALUE v
= rb_str_new(0, 0);
1482 rb_enc_copy(v
, str
);
/* Minimum capacity applied to heap buffers by rb_str_buf_new() and by the
 * capacity: path of rb_str_init(). The static assertion checks at compile
 * time that it is larger than RSTRING_EMBED_LEN_MAX. */
1486 #define STR_BUF_MIN_SIZE 63
1488 STATIC_ASSERT(STR_BUF_MIN_SIZE
, STR_BUF_MIN_SIZE
> RSTRING_EMBED_LEN_MAX
);
/* Creates an empty rb_cString with room for at least capa bytes.
 *
 * When capa (plus a 1-byte terminator) is embeddable, an embedded string of
 * capa + 1 bytes is returned. Otherwise a heap string is allocated: capa is
 * raised to STR_BUF_MIN_SIZE if smaller, capa + 1 bytes are allocated with
 * ALLOC_N, and the first byte is set to NUL.
 * NOTE(review): the final return is not visible in this listing.
 */
1492 rb_str_buf_new(long capa
)
1494 if (STR_EMBEDDABLE_P(capa
, 1)) {
1495 return str_alloc_embed(rb_cString
, capa
+ 1);
1498 VALUE str
= str_alloc_heap(rb_cString
);
1501 if (capa
< STR_BUF_MIN_SIZE
) {
1502 capa
= STR_BUF_MIN_SIZE
;
1505 FL_SET(str
, STR_NOEMBED
);
1506 RSTRING(str
)->as
.heap
.aux
.capa
= capa
;
1507 RSTRING(str
)->as
.heap
.ptr
= ALLOC_N(char, (size_t)capa
+ 1);
1508 RSTRING(str
)->as
.heap
.ptr
[0] = '\0';
/* Creates a string buffer from the NUL-terminated C string ptr: measures it
 * with strlen(), allocates with rb_str_buf_new(len) and appends the bytes
 * with rb_str_buf_cat().
 * NOTE(review): the return statement is not visible in this listing. */
1514 rb_str_buf_new_cstr(const char *ptr
)
1517 long len
= strlen(ptr
);
1519 str
= rb_str_buf_new(len
);
1520 rb_str_buf_cat(str
, ptr
, len
);
/* Returns str_new(0, 0, len).
 * NOTE(review): str_new() is not visible here; the two zero arguments are
 * presumably the class and the source pointer, giving a class-less string
 * of length len -- confirm. */
1526 rb_str_tmp_new(long len
)
1528 return str_new(0, 0, len
);
/* Releases the resources held by str.
 *
 *  - If str has RSTRING_FSTR set, it is removed from the table returned by
 *    rb_vm_fstring_table() and the obj_str_fstr debug counter is bumped.
 *  - Embedded strings only bump obj_str_embed.
 *  - Strings with STR_SHARED or STR_NOFREE do not free the buffer; only
 *    debug counters are updated.
 *  - Any other string bumps obj_str_ptr and frees its heap buffer with
 *    ruby_sized_xfree().
 *
 * NOTE(review): both RB_DEBUG_COUNTER_INC_IF calls in the shared/nofree
 * branch increment obj_str_shared, although the second one tests
 * STR_NOFREE. This looks like a copy-paste slip; confirm whether a
 * dedicated counter for STR_NOFREE strings was intended.
 */
1532 rb_str_free(VALUE str
)
1534 if (FL_TEST(str
, RSTRING_FSTR
)) {
1535 st_data_t fstr
= (st_data_t
)str
;
1539 st_delete(rb_vm_fstring_table(), &fstr
, NULL
);
1540 RB_DEBUG_COUNTER_INC(obj_str_fstr
);
1545 if (STR_EMBED_P(str
)) {
1546 RB_DEBUG_COUNTER_INC(obj_str_embed
);
1548 else if (FL_TEST(str
, STR_SHARED
| STR_NOFREE
)) {
1549 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared
, FL_TEST(str
, STR_SHARED
));
1550 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared
, FL_TEST(str
, STR_NOFREE
));
1553 RB_DEBUG_COUNTER_INC(obj_str_ptr
);
1554 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
/* Reports the heap memory owned by str: STR_HEAP_SIZE(str) is returned only
 * when, among STR_NOEMBED, STR_SHARED and STR_NOFREE, exactly STR_NOEMBED
 * is set (a non-embedded buffer that is neither shared nor no-free).
 * NOTE(review): the fall-through return is not visible in this listing. */
1558 RUBY_FUNC_EXPORTED
size_t
1559 rb_str_memsize(VALUE str
)
1561 if (FL_TEST(str
, STR_NOEMBED
|STR_SHARED
|STR_NOFREE
) == STR_NOEMBED
) {
1562 return STR_HEAP_SIZE(str
);
/* Converts str to a T_STRING via rb_convert_type_with_id(), using the
 * method ID idTo_str and the type name "String". */
1570 rb_str_to_str(VALUE str
)
1572 return rb_convert_type_with_id(str
, T_STRING
, "String", idTo_str
);
/* Forward declarations for helpers defined later in this file. */
1575 static inline void str_discard(VALUE str
);
1576 static void str_shared_replace(VALUE str
, VALUE str2
);
/* Guarded entry point: calls str_shared_replace(str, str2) only when the
 * two arguments are different objects (str_shared_replace() asserts
 * str2 != str). */
1579 rb_str_shared_replace(VALUE str
, VALUE str2
)
1581 if (str
!= str2
) str_shared_replace(str
, str2
);
/* Moves the contents of str2 into str and leaves str2 as an empty embedded
 * string. Requires str2 != str (asserted).
 *
 * Encoding and code range are read from str2 first; termlen is the
 * encoding's minimum character length.
 *  - If str's embed capacity can hold str2's bytes plus the terminator, the
 *    bytes (including terminator) are copied into str's embedded storage and
 *    encoding/code range are applied.
 *  - Otherwise: an embedded str2 is first converted to a heap string by
 *    copying its bytes into a fresh ALLOC_N buffer. Then str becomes a
 *    non-embedded string taking str2's ptr and len; it inherits str2's
 *    shared root when str2 is STR_SHARED, else str2's capa. str2 is reset
 *    to embedded with length 0, and encoding/code range are set on str.
 * NOTE(review): several lines (declarations, braces, any call that releases
 * str's previous buffer, returns) are missing from this listing.
 */
1585 str_shared_replace(VALUE str
, VALUE str2
)
1591 RUBY_ASSERT(str2
!= str
);
1592 enc
= STR_ENC_GET(str2
);
1593 cr
= ENC_CODERANGE(str2
);
1595 termlen
= rb_enc_mbminlen(enc
);
1597 if (str_embed_capa(str
) >= RSTRING_LEN(str2
) + termlen
) {
1599 memcpy(RSTRING_PTR(str
), RSTRING_PTR(str2
), (size_t)RSTRING_LEN(str2
) + termlen
);
1600 STR_SET_EMBED_LEN(str
, RSTRING_LEN(str2
));
1601 rb_enc_associate(str
, enc
);
1602 ENC_CODERANGE_SET(str
, cr
);
1606 if (STR_EMBED_P(str2
)) {
1607 assert(!FL_TEST(str2
, STR_SHARED
));
1608 long len
= RSTRING(str2
)->as
.embed
.len
;
1609 assert(len
+ termlen
<= str_embed_capa(str2
));
1611 char *new_ptr
= ALLOC_N(char, len
+ termlen
);
1612 memcpy(new_ptr
, RSTRING(str2
)->as
.embed
.ary
, len
+ termlen
);
1613 RSTRING(str2
)->as
.heap
.ptr
= new_ptr
;
1614 RSTRING(str2
)->as
.heap
.len
= len
;
1615 RSTRING(str2
)->as
.heap
.aux
.capa
= len
;
1616 STR_SET_NOEMBED(str2
);
1620 STR_SET_NOEMBED(str
);
1621 FL_UNSET(str
, STR_SHARED
);
1622 RSTRING(str
)->as
.heap
.ptr
= RSTRING_PTR(str2
);
1623 RSTRING(str
)->as
.heap
.len
= RSTRING_LEN(str2
);
1625 if (FL_TEST(str2
, STR_SHARED
)) {
1626 VALUE shared
= RSTRING(str2
)->as
.heap
.aux
.shared
;
1627 STR_SET_SHARED(str
, shared
);
1630 RSTRING(str
)->as
.heap
.aux
.capa
= RSTRING(str2
)->as
.heap
.aux
.capa
;
1634 STR_SET_EMBED(str2
);
1635 RSTRING_PTR(str2
)[0] = 0;
1636 STR_SET_EMBED_LEN(str2
, 0);
1637 rb_enc_associate(str
, enc
);
1638 ENC_CODERANGE_SET(str
, cr
);
/* Converts obj to a String. A T_STRING argument is special-cased
 * (NOTE(review): the branch body is not visible; presumably obj is returned
 * unchanged -- confirm). Otherwise obj's to_s (idTo_s) is invoked with no
 * arguments and the result is passed through rb_obj_as_string_result(). */
1643 rb_obj_as_string(VALUE obj
)
1647 if (RB_TYPE_P(obj
, T_STRING
)) {
1650 str
= rb_funcall(obj
, idTo_s
, 0);
1651 return rb_obj_as_string_result(str
, obj
);
/* Validates the value returned by to_s: when str is not a T_STRING, falls
 * back to rb_any_to_s(obj).
 * NOTE(review): the return for the T_STRING case is not visible here. */
1654 MJIT_FUNC_EXPORTED VALUE
1655 rb_obj_as_string_result(VALUE str
, VALUE obj
)
1657 if (!RB_TYPE_P(str
, T_STRING
))
1658 return rb_any_to_s(obj
);
/* Makes str refer to the contents of str2.
 *
 *  - If str2 is already shared, str becomes a non-embedded string with
 *    str2's len and ptr, linked to the same (frozen, asserted) shared root,
 *    and encoding/code range are copied from str2.
 *  - Otherwise str_replace_shared(str, str2) is used.
 * NOTE(review): lines between the signature and the first statement, and
 * the return, are missing from this listing.
 */
1663 str_replace(VALUE str
, VALUE str2
)
1667 len
= RSTRING_LEN(str2
);
1668 if (STR_SHARED_P(str2
)) {
1669 VALUE shared
= RSTRING(str2
)->as
.heap
.aux
.shared
;
1670 assert(OBJ_FROZEN(shared
));
1671 STR_SET_NOEMBED(str
);
1672 RSTRING(str
)->as
.heap
.len
= len
;
1673 RSTRING(str
)->as
.heap
.ptr
= RSTRING_PTR(str2
);
1674 STR_SET_SHARED(str
, shared
);
1675 rb_enc_cr_str_exact_copy(str
, str2
);
1678 str_replace_shared(str
, str2
);
/* Allocates a T_STRING object of class klass and the given slot size using
 * the execution context ec (RB_RVARGC_EC_NEWOBJ_OF). FL_WB_PROTECTED is
 * added to the flags when RGENGC_WB_PROTECTED_STRING is non-zero.
 * NOTE(review): the return statement is not visible in this listing. */
1685 ec_str_alloc(struct rb_execution_context_struct
*ec
, VALUE klass
, size_t size
)
1688 RB_RVARGC_EC_NEWOBJ_OF(ec
, str
, struct RString
, klass
,
1689 T_STRING
| (RGENGC_WB_PROTECTED_STRING
? FL_WB_PROTECTED
: 0), size
);
/* Allocates an embedded string object able to hold capa bytes: the slot
 * size comes from str_embed_size(capa) and is asserted to be allocatable.
 * NOTE(review): the second assert (size <= sizeof(struct RString)) probably
 * sits in a preprocessor branch that is not visible here -- confirm. */
1694 ec_str_alloc_embed(struct rb_execution_context_struct
*ec
, VALUE klass
, size_t capa
)
1696 size_t size
= str_embed_size(capa
);
1697 assert(rb_gc_size_allocatable_p(size
));
1699 assert(size
<= sizeof(struct RString
));
1701 return ec_str_alloc(ec
, klass
, size
);
/* Allocates a string object of the base size sizeof(struct RString) via
 * ec_str_alloc(); used for non-embedded strings. */
1705 ec_str_alloc_heap(struct rb_execution_context_struct
*ec
, VALUE klass
)
1707 return ec_str_alloc(ec
, klass
, sizeof(struct RString
));
/* Fills the freshly allocated object dup so that it duplicates str.
 *
 * flag_mask selects the flag bits carried over (embed/length bits, code
 * range, encoding; further terms of the mask are not visible here).
 *  - Embedded str: the embed length is set on dup and len + 1 bytes are
 *    copied into dup's embedded array.
 *  - Otherwise a root is chosen: str's shared root when STR_SHARED is set;
 *    else, if str is not frozen, str is replaced by
 *    str_new_frozen(klass, str), which also becomes the root, and flags are
 *    re-read. The root is asserted to be unshared and frozen.
 *    An embedded root is copied byte-wise (RSTRING_EMBED_LEN_MAX + 1
 *    bytes); otherwise dup gets str's len/ptr, a write-barriered reference
 *    to the root, and the RSTRING_NOEMBED | STR_SHARED flags.
 *  - When the inline encoding bits equal ENCODING_INLINE_MAX, the real
 *    index is fetched with rb_enc_get_index() and re-associated after the
 *    flags are set.
 * FL_FREEZE is always masked out, so dup is never frozen by this function.
 * NOTE(review): declarations, else-lines and the return are missing from
 * this listing; verify branch nesting against the full file.
 */
1711 str_duplicate_setup(VALUE klass
, VALUE str
, VALUE dup
)
1713 const VALUE flag_mask
=
1715 RSTRING_NOEMBED
| RSTRING_EMBED_LEN_MASK
|
1717 ENC_CODERANGE_MASK
| ENCODING_MASK
|
1720 VALUE flags
= FL_TEST_RAW(str
, flag_mask
);
1722 if (STR_EMBED_P(str
)) {
1723 long len
= RSTRING_EMBED_LEN(str
);
1725 assert(str_embed_capa(dup
) >= len
+ 1);
1726 STR_SET_EMBED_LEN(dup
, len
);
1727 MEMCPY(RSTRING(dup
)->as
.embed
.ary
, RSTRING(str
)->as
.embed
.ary
, char, len
+ 1);
1731 if (FL_TEST_RAW(str
, STR_SHARED
)) {
1732 root
= RSTRING(str
)->as
.heap
.aux
.shared
;
1734 else if (UNLIKELY(!(flags
& FL_FREEZE
))) {
1735 root
= str
= str_new_frozen(klass
, str
);
1736 flags
= FL_TEST_RAW(str
, flag_mask
);
1738 assert(!STR_SHARED_P(root
));
1739 assert(RB_OBJ_FROZEN_RAW(root
));
1743 if (STR_EMBED_P(root
)) {
1744 MEMCPY(RSTRING(dup
)->as
.embed
.ary
, RSTRING(root
)->as
.embed
.ary
,
1745 char, RSTRING_EMBED_LEN_MAX
+ 1);
1749 RSTRING(dup
)->as
.heap
.len
= RSTRING_LEN(str
);
1750 RSTRING(dup
)->as
.heap
.ptr
= RSTRING_PTR(str
);
1751 RB_OBJ_WRITE(dup
, &RSTRING(dup
)->as
.heap
.aux
.shared
, root
);
1752 flags
|= RSTRING_NOEMBED
| STR_SHARED
;
1756 if ((flags
& ENCODING_MASK
) == (ENCODING_INLINE_MAX
<<ENCODING_SHIFT
)) {
1757 encidx
= rb_enc_get_index(str
);
1758 flags
&= ~ENCODING_MASK
;
1760 FL_SET_RAW(dup
, flags
& ~FL_FREEZE
);
1761 if (encidx
) rb_enc_associate_index(dup
, encidx
);
/* Duplicates str as an instance of klass, allocating through ec.
 * A heap-sized object is used when USE_RVARGC is off or str is
 * STR_NOEMBED; otherwise an embedded object sized for str's embed length
 * plus its terminator. The contents are then set up by
 * str_duplicate_setup(). */
1766 ec_str_duplicate(struct rb_execution_context_struct
*ec
, VALUE klass
, VALUE str
)
1769 if (!USE_RVARGC
|| FL_TEST(str
, STR_NOEMBED
)) {
1770 dup
= ec_str_alloc_heap(ec
, klass
);
1773 dup
= ec_str_alloc_embed(ec
, klass
, RSTRING_EMBED_LEN(str
) + TERM_LEN(str
));
1776 return str_duplicate_setup(klass
, str
, dup
);
/* Same as ec_str_duplicate() but allocates with str_alloc_heap() /
 * str_alloc_embed() instead of the execution-context variants: heap-sized
 * when USE_RVARGC is off or str is STR_NOEMBED, embedded otherwise. The
 * contents are then set up by str_duplicate_setup(). */
1780 str_duplicate(VALUE klass
, VALUE str
)
1783 if (!USE_RVARGC
|| FL_TEST(str
, STR_NOEMBED
)) {
1784 dup
= str_alloc_heap(klass
);
1787 dup
= str_alloc_embed(klass
, RSTRING_EMBED_LEN(str
) + TERM_LEN(str
));
1790 return str_duplicate_setup(klass
, str
, dup
);
/* Duplicates str, keeping its class (rb_obj_class(str)). */
1794 rb_str_dup(VALUE str
)
1796 return str_duplicate(rb_obj_class(str
), str
);
/* Fires the STRING creation DTrace hook with str's length, then returns a
 * duplicate of str whose class is always rb_cString. */
1800 rb_str_resurrect(VALUE str
)
1802 RUBY_DTRACE_CREATE_HOOK(STRING
, RSTRING_LEN(str
));
1803 return str_duplicate(rb_cString
, str
);
/* Execution-context variant of rb_str_resurrect(): fires the STRING
 * creation DTrace hook, then duplicates str as an rb_cString through
 * ec_str_duplicate(). */
1807 rb_ec_str_resurrect(struct rb_execution_context_struct
*ec
, VALUE str
)
1809 RUBY_DTRACE_CREATE_HOOK(STRING
, RSTRING_LEN(str
));
1810 return ec_str_duplicate(ec
, rb_cString
, str
);
1815 * String.new(string = '') -> new_string
1816 * String.new(string = '', encoding: encoding) -> new_string
1817 * String.new(string = '', capacity: size) -> new_string
1819 * Returns a new \String that is a copy of +string+.
1821 * With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
1824 * s.encoding # => #<Encoding:ASCII-8BIT>
1826 * With the single \String argument +string+, returns a copy of +string+
1827 * with the same encoding as +string+:
1828 * s = String.new("Que veut dire \u{e7}a?")
1829 * s # => "Que veut dire \u{e7}a?"
1830 * s.encoding # => #<Encoding:UTF-8>
1832 * Literal strings like <tt>""</tt> or here-documents always use
1833 * {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
1835 * With keyword +encoding+, returns a copy of +string+
1836 * with the specified encoding:
1837 * s = String.new(encoding: 'ASCII')
1838 * s.encoding # => #<Encoding:US-ASCII>
1839 * s = String.new('foo', encoding: 'ASCII')
1840 * s.encoding # => #<Encoding:US-ASCII>
1842 * Note that these are equivalent:
1843 * s0 = String.new('foo', encoding: 'ASCII')
1844 * s1 = 'foo'.force_encoding('ASCII')
1845 * s0.encoding == s1.encoding # => true
1847 * With keyword +capacity+, returns a copy of +string+;
1848 * the given +capacity+ may set the size of the internal buffer,
1849 * which may affect performance:
1850 * String.new(capacity: 1) # => ""
1851 * String.new(capacity: 4096) # => ""
1853 * The +string+, +encoding+, and +capacity+ arguments may all be used together:
1855 * String.new('hello', encoding: 'UTF-8', capacity: 25)
/* Implementation of String#initialize(string = '', encoding:, capacity:).
 *
 * Keyword IDs are cached in a function-local static array on first use
 * ("encoding" via rb_id_encoding(), "capacity" via CONST_ID). Arguments are
 * parsed with rb_scan_args("01:"), i.e. one optional positional argument
 * plus an options hash; both keywords are optional in rb_get_kwargs().
 *
 * When capacity: is given (not Qundef/nil):
 *  - capa is raised to STR_BUF_MIN_SIZE if smaller; termlen follows the
 *    requested encoding when one was given, else 1;
 *  - passing the receiver itself as the source (orig == str) is treated as
 *    "no source" (n = 0);
 *  - the receiver must be modifiable, and is always turned into a heap
 *    string with a buffer of capa + termlen bytes: a new buffer is
 *    allocated for embedded or shared/no-free strings (copying existing
 *    bytes), while an owned heap buffer is reallocated only if its size
 *    differs;
 *  - the source bytes and its encoding/code range are then copied in.
 * Without capacity:, rb_str_replace(str, orig) is used.
 * When encoding: is given, it is associated with the receiver and the code
 * range is cleared.
 * NOTE(review): declarations, conditions guarding some statements (e.g.
 * the two memcpy variants, presumably alternative preprocessor branches)
 * and the return are missing from this listing.
 */
1860 rb_str_init(int argc
, VALUE
*argv
, VALUE str
)
1862 static ID keyword_ids
[2];
1863 VALUE orig
, opt
, venc
, vcapa
;
1865 rb_encoding
*enc
= 0;
1868 if (!keyword_ids
[0]) {
1869 keyword_ids
[0] = rb_id_encoding();
1870 CONST_ID(keyword_ids
[1], "capacity");
1873 n
= rb_scan_args(argc
, argv
, "01:", &orig
, &opt
);
1875 rb_get_kwargs(opt
, keyword_ids
, 0, 2, kwargs
);
1878 if (venc
!= Qundef
&& !NIL_P(venc
)) {
1879 enc
= rb_to_encoding(venc
);
1881 if (vcapa
!= Qundef
&& !NIL_P(vcapa
)) {
1882 long capa
= NUM2LONG(vcapa
);
1884 int termlen
= enc
? rb_enc_mbminlen(enc
) : 1;
1886 if (capa
< STR_BUF_MIN_SIZE
) {
1887 capa
= STR_BUF_MIN_SIZE
;
1891 len
= RSTRING_LEN(orig
);
1895 if (orig
== str
) n
= 0;
1897 str_modifiable(str
);
1898 if (STR_EMBED_P(str
)) { /* make noembed always */
1899 char *new_ptr
= ALLOC_N(char, (size_t)capa
+ termlen
);
1901 assert(RSTRING(str
)->as
.embed
.len
+ 1 <= str_embed_capa(str
));
1902 memcpy(new_ptr
, RSTRING(str
)->as
.embed
.ary
, RSTRING(str
)->as
.embed
.len
+ 1);
1904 memcpy(new_ptr
, RSTRING(str
)->as
.embed
.ary
, RSTRING_EMBED_LEN_MAX
+ 1);
1906 RSTRING(str
)->as
.heap
.ptr
= new_ptr
;
1908 else if (FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
1909 const size_t size
= (size_t)capa
+ termlen
;
1910 const char *const old_ptr
= RSTRING_PTR(str
);
1911 const size_t osize
= RSTRING(str
)->as
.heap
.len
+ TERM_LEN(str
);
1912 char *new_ptr
= ALLOC_N(char, (size_t)capa
+ termlen
);
1913 memcpy(new_ptr
, old_ptr
, osize
< size
? osize
: size
);
1914 FL_UNSET_RAW(str
, STR_SHARED
|STR_NOFREE
);
1915 RSTRING(str
)->as
.heap
.ptr
= new_ptr
;
1917 else if (STR_HEAP_SIZE(str
) != (size_t)capa
+ termlen
) {
1918 SIZED_REALLOC_N(RSTRING(str
)->as
.heap
.ptr
, char,
1919 (size_t)capa
+ termlen
, STR_HEAP_SIZE(str
));
1921 RSTRING(str
)->as
.heap
.len
= len
;
1922 TERM_FILL(&RSTRING(str
)->as
.heap
.ptr
[len
], termlen
);
1924 memcpy(RSTRING(str
)->as
.heap
.ptr
, RSTRING_PTR(orig
), len
);
1925 rb_enc_cr_str_exact_copy(str
, orig
);
1927 FL_SET(str
, STR_NOEMBED
);
1928 RSTRING(str
)->as
.heap
.aux
.capa
= capa
;
1931 rb_str_replace(str
, orig
);
1934 rb_enc_associate(str
, enc
);
1935 ENC_CODERANGE_CLEAR(str
);
1939 rb_str_replace(str
, orig
);
/* is_utf8_lead_byte(c): true unless the top two bits of c are 10, i.e. for
 * every byte that is not a UTF-8 continuation byte (10xxxxxx). Only defined
 * when NONASCII_MASK is available. */
1944 #ifdef NONASCII_MASK
1945 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1948 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1949 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
1950 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1952 * if (!(byte & 0x80))
1953 * byte |= 0x40; // turn on bit6
1954 * return ((byte>>6) & 1); // bit6 represents whether this byte is leading or not.
1956 * This function calculates whether a byte is leading or not for all bytes
1957 * in the argument word by concurrently using the above logic, and then
1958 * adds up the number of leading bytes in the word.
/* Counts the UTF-8 leading bytes contained in the machine word at *s.
 * Each byte of the word is reduced to a single low bit (bit6 | ~bit7,
 * masked with NONASCII_MASK >> 7); the bits are then summed, with
 * rb_popcount_intptr() when a hardware POPCNT builtin is available.
 * NOTE(review): the load of *s into d and the non-POPCNT summation are not
 * visible in this listing. */
1960 static inline uintptr_t
1961 count_utf8_lead_bytes_with_word(const uintptr_t *s
)
1965 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1966 d
= (d
>>6) | (~d
>>7);
1967 d
&= NONASCII_MASK
>> 7;
1969 /* Gather all bytes. */
1970 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1971 /* use only if it can use POPCNT */
1972 return rb_popcount_intptr(d
);
1976 # if SIZEOF_VOIDP == 8
/* Counts the characters in the byte range [p, e) for encoding enc, using
 * the code range hint cr to pick a strategy:
 *  - fixed-width encodings (mbmaxlen == mbminlen): byte length divided by
 *    the character width, rounded up when there is a remainder;
 *  - UTF-8 with ENC_CODERANGE_VALID (only if NONASCII_MASK is defined):
 *    counts lead bytes, word-at-a-time for the aligned middle part when the
 *    range is longer than two machine words, byte-wise for the edges;
 *  - ASCII-compatible encodings: alternates search_nonascii() with
 *    per-character stepping, using rb_enc_fast_mbclen() when
 *    ENC_CODERANGE_CLEAN_P(cr) holds and rb_enc_mbclen() otherwise;
 *  - anything else: steps one character at a time with rb_enc_mbclen().
 * NOTE(review): loop headers, counters and returns are partly missing from
 * this listing.
 */
1985 enc_strlen(const char *p
, const char *e
, rb_encoding
*enc
, int cr
)
1990 if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
1991 long diff
= (long)(e
- p
);
1992 return diff
/ rb_enc_mbminlen(enc
) + !!(diff
% rb_enc_mbminlen(enc
));
1994 #ifdef NONASCII_MASK
1995 else if (cr
== ENC_CODERANGE_VALID
&& enc
== rb_utf8_encoding()) {
1997 if ((int)sizeof(uintptr_t) * 2 < e
- p
) {
1998 const uintptr_t *s
, *t
;
1999 const uintptr_t lowbits
= sizeof(uintptr_t) - 1;
2000 s
= (const uintptr_t*)(~lowbits
& ((uintptr_t)p
+ lowbits
));
2001 t
= (const uintptr_t*)(~lowbits
& (uintptr_t)e
);
2002 while (p
< (const char *)s
) {
2003 if (is_utf8_lead_byte(*p
)) len
++;
2007 len
+= count_utf8_lead_bytes_with_word(s
);
2010 p
= (const char *)s
;
2013 if (is_utf8_lead_byte(*p
)) len
++;
2019 else if (rb_enc_asciicompat(enc
)) {
2021 if (ENC_CODERANGE_CLEAN_P(cr
)) {
2024 q
= search_nonascii(p
, e
);
2030 p
+= rb_enc_fast_mbclen(p
, e
, enc
);
2037 q
= search_nonascii(p
, e
);
2043 p
+= rb_enc_mbclen(p
, e
, enc
);
2050 for (c
=0; p
<e
; c
++) {
2051 p
+= rb_enc_mbclen(p
, e
, enc
);
/* Character count of [p, e) in encoding enc with no code-range knowledge:
 * delegates to enc_strlen() with ENC_CODERANGE_UNKNOWN. */
2057 rb_enc_strlen(const char *p
, const char *e
, rb_encoding
*enc
)
2059 return enc_strlen(p
, e
, enc
, ENC_CODERANGE_UNKNOWN
);
2062 /* To get strlen with cr
2063 * Note that given cr is not used.
/* Counts the characters in [p, e) and reports the detected code range
 * through *cr.
 *  - Fixed-width encodings return the byte length divided by the character
 *    width, rounded up.
 *  - ASCII-compatible encodings skip ASCII runs with search_nonascii() and
 *    validate other characters with rb_enc_precise_mbclen().
 *  - Other encodings validate every character with rb_enc_precise_mbclen().
 * A valid multibyte character ORs ENC_CODERANGE_VALID into *cr; an invalid
 * one sets *cr to ENC_CODERANGE_BROKEN. If *cr is still zero at the end it
 * becomes ENC_CODERANGE_7BIT.
 * NOTE(review): the initialisation of *cr, loop headers and returns are not
 * visible in this listing.
 */
2066 rb_enc_strlen_cr(const char *p
, const char *e
, rb_encoding
*enc
, int *cr
)
2073 if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
2074 long diff
= (long)(e
- p
);
2075 return diff
/ rb_enc_mbminlen(enc
) + !!(diff
% rb_enc_mbminlen(enc
));
2077 else if (rb_enc_asciicompat(enc
)) {
2081 q
= search_nonascii(p
, e
);
2083 if (!*cr
) *cr
= ENC_CODERANGE_7BIT
;
2089 ret
= rb_enc_precise_mbclen(p
, e
, enc
);
2090 if (MBCLEN_CHARFOUND_P(ret
)) {
2091 *cr
|= ENC_CODERANGE_VALID
;
2092 p
+= MBCLEN_CHARFOUND_LEN(ret
);
2095 *cr
= ENC_CODERANGE_BROKEN
;
2100 if (!*cr
) *cr
= ENC_CODERANGE_7BIT
;
2104 for (c
=0; p
<e
; c
++) {
2105 ret
= rb_enc_precise_mbclen(p
, e
, enc
);
2106 if (MBCLEN_CHARFOUND_P(ret
)) {
2107 *cr
|= ENC_CODERANGE_VALID
;
2108 p
+= MBCLEN_CHARFOUND_LEN(ret
);
2111 *cr
= ENC_CODERANGE_BROKEN
;
2112 if (p
+ rb_enc_mbminlen(enc
) <= e
)
2113 p
+= rb_enc_mbminlen(enc
);
2118 if (!*cr
) *cr
= ENC_CODERANGE_7BIT
;
2122 /* enc must be str's enc or rb_enc_check(str, str2) */
/* Character length of str.
 * Single-byte-optimizable strings return their byte length directly.
 * enc may be NULL, in which case str's own encoding is used. When str's
 * code range is still unknown, rb_enc_strlen_cr() is used so the scan also
 * yields a code range, which is cached on str when non-zero; otherwise
 * enc_strlen() is called with the known code range.
 * NOTE(review): the return inside the unknown-code-range branch is not
 * visible in this listing. */
2124 str_strlen(VALUE str
, rb_encoding
*enc
)
2129 if (single_byte_optimizable(str
)) return RSTRING_LEN(str
);
2130 if (!enc
) enc
= STR_ENC_GET(str
);
2131 p
= RSTRING_PTR(str
);
2132 e
= RSTRING_END(str
);
2133 cr
= ENC_CODERANGE(str
);
2135 if (cr
== ENC_CODERANGE_UNKNOWN
) {
2136 long n
= rb_enc_strlen_cr(p
, e
, enc
, &cr
);
2137 if (cr
) ENC_CODERANGE_SET(str
, cr
);
2141 return enc_strlen(p
, e
, enc
, cr
);
/* Character length of str as a C long, using str's own encoding
 * (enc == NULL is resolved inside str_strlen()). */
2146 rb_str_strlen(VALUE str
)
2148 return str_strlen(str
, NULL
);
2155 * Returns the count of characters (not bytes) in +self+:
2157 * "\x80\u3042".length # => 2
2158 * "hello".length # => 5
2160 * String#size is an alias for String#length.
2162 * Related: String#bytesize.
/* String#length: character count of str, boxed with LONG2NUM(). */
2166 rb_str_length(VALUE str
)
2168 return LONG2NUM(str_strlen(str
, NULL
));
2173 * bytesize -> integer
2175 * Returns the count of bytes in +self+:
2177 * "\x80\u3042".bytesize # => 4
2178 * "hello".bytesize # => 5
2180 * Related: String#length.
/* String#bytesize: byte length of str (RSTRING_LEN), boxed with
 * LONG2NUM(). */
2184 rb_str_bytesize(VALUE str
)
2186 return LONG2NUM(RSTRING_LEN(str
));
2191 * empty? -> true or false
2193 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2195 * "hello".empty? # => false
2196 * " ".empty? # => false
2197 * "".empty? # => true
/* String#empty?: Ruby true/false (RBOOL) depending on whether the byte
 * length of str is zero. */
2202 rb_str_empty(VALUE str
)
2204 return RBOOL(RSTRING_LEN(str
) == 0);
2209 * string + other_string -> new_string
2211 * Returns a new \String containing +other_string+ concatenated to +self+:
2213 * "Hello from " + self.to_s # => "Hello from main"
/* String#+: returns a new rb_cString holding str1 followed by str2.
 *
 * The result encoding comes from rb_enc_check_str(str1, str2). An
 * ArgumentError ("string size too big") is raised when len1 + len2 would
 * exceed LONG_MAX. Both byte ranges are memcpy'd into a buffer created by
 * str_new0(), the terminator is written, and encoding plus the combined
 * code range (ENC_CODERANGE_AND of both operands) are set on the result.
 * NOTE(review): declarations and the return are not visible in this
 * listing.
 */
2218 rb_str_plus(VALUE str1
, VALUE str2
)
2222 char *ptr1
, *ptr2
, *ptr3
;
2227 enc
= rb_enc_check_str(str1
, str2
);
2228 RSTRING_GETMEM(str1
, ptr1
, len1
);
2229 RSTRING_GETMEM(str2
, ptr2
, len2
);
2230 termlen
= rb_enc_mbminlen(enc
);
2231 if (len1
> LONG_MAX
- len2
) {
2232 rb_raise(rb_eArgError
, "string size too big");
2234 str3
= str_new0(rb_cString
, 0, len1
+len2
, termlen
);
2235 ptr3
= RSTRING_PTR(str3
);
2236 memcpy(ptr3
, ptr1
, len1
);
2237 memcpy(ptr3
+len1
, ptr2
, len2
);
2238 TERM_FILL(&ptr3
[len1
+len2
], termlen
);
2240 ENCODING_CODERANGE_SET(str3
, rb_enc_to_index(enc
),
2241 ENC_CODERANGE_AND(ENC_CODERANGE(str1
), ENC_CODERANGE(str2
)));
2247 /* A variant of rb_str_plus that does not raise but return Qundef instead. */
/* Pre-checks the situations in which rb_str_plus() could fail before
 * delegating to it. Both operands must be plain rb_cString instances
 * (asserted). The visible conditions reject an encoding index below zero
 * for str2, differing encoding indexes, and a combined length exceeding
 * LONG_MAX.
 * NOTE(review): the bodies of these branches (and a preceding check,
 * presumably on enc1) are not visible in this listing; confirm what each
 * returns. */
2248 MJIT_FUNC_EXPORTED VALUE
2249 rb_str_opt_plus(VALUE str1
, VALUE str2
)
2251 assert(RBASIC_CLASS(str1
) == rb_cString
);
2252 assert(RBASIC_CLASS(str2
) == rb_cString
);
2254 MAYBE_UNUSED(char) *ptr1
, *ptr2
;
2255 RSTRING_GETMEM(str1
, ptr1
, len1
);
2256 RSTRING_GETMEM(str2
, ptr2
, len2
);
2257 int enc1
= rb_enc_get_index(str1
);
2258 int enc2
= rb_enc_get_index(str2
);
2263 else if (enc2
< 0) {
2266 else if (enc1
!= enc2
) {
2269 else if (len1
> LONG_MAX
- len2
) {
2273 return rb_str_plus(str1
, str2
);
2280 * string * integer -> new_string
2282 * Returns a new \String containing +integer+ copies of +self+:
2284 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2285 * "Ho! " * 0 # => ""
/* String#*: returns a new rb_cString made of `times` copies of str.
 *
 * Fast paths: times == 1 duplicates str; times == 0 yields an empty
 * embedded string with str's encoding. A negative count raises
 * ArgumentError ("negative argument"). A receiver that is a single NUL
 * byte is special-cased by producing zero-filled storage (memset / ZALLOC_N)
 * instead of copying. Otherwise the total length is checked for overflow
 * against LONG_MAX ("argument too big"), the buffer is created with
 * str_new0(), and it is filled by repeated doubling: copy str once, keep
 * copying the filled prefix onto itself while n <= len/2, then copy the
 * remaining len - n bytes. Finally length, terminator and encoding/code
 * range are set.
 * NOTE(review): declarations, the negative-count condition, the doubling
 * step of n and the returns are not visible in this listing.
 */
2290 rb_str_times(VALUE str
, VALUE times
)
2297 if (times
== INT2FIX(1)) {
2298 return str_duplicate(rb_cString
, str
);
2300 if (times
== INT2FIX(0)) {
2301 str2
= str_alloc_embed(rb_cString
, 0);
2302 rb_enc_copy(str2
, str
);
2305 len
= NUM2LONG(times
);
2307 rb_raise(rb_eArgError
, "negative argument");
2309 if (RSTRING_LEN(str
) == 1 && RSTRING_PTR(str
)[0] == 0) {
2310 if (STR_EMBEDDABLE_P(len
, 1)) {
2311 str2
= str_alloc_embed(rb_cString
, len
+ 1);
2312 memset(RSTRING_PTR(str2
), 0, len
+ 1);
2315 str2
= str_alloc_heap(rb_cString
);
2316 RSTRING(str2
)->as
.heap
.aux
.capa
= len
;
2317 RSTRING(str2
)->as
.heap
.ptr
= ZALLOC_N(char, (size_t)len
+ 1);
2318 STR_SET_NOEMBED(str2
);
2320 STR_SET_LEN(str2
, len
);
2321 rb_enc_copy(str2
, str
);
2324 if (len
&& LONG_MAX
/len
< RSTRING_LEN(str
)) {
2325 rb_raise(rb_eArgError
, "argument too big");
2328 len
*= RSTRING_LEN(str
);
2329 termlen
= TERM_LEN(str
);
2330 str2
= str_new0(rb_cString
, 0, len
, termlen
);
2331 ptr2
= RSTRING_PTR(str2
);
2333 n
= RSTRING_LEN(str
);
2334 memcpy(ptr2
, RSTRING_PTR(str
), n
);
2335 while (n
<= len
/2) {
2336 memcpy(ptr2
+ n
, ptr2
, n
);
2339 memcpy(ptr2
+ n
, ptr2
, len
-n
);
2341 STR_SET_LEN(str2
, len
);
2342 TERM_FILL(&ptr2
[len
], termlen
);
2343 rb_enc_cr_str_copy_for_substr(str2
, str
);
2350 * string % object -> new_string
2352 * Returns the result of formatting +object+ into the format specification +self+
2353 * (see Kernel#sprintf for formatting details):
2355 * "%05d" % 123 # => "00123"
2357 * If +self+ contains multiple substitutions, +object+ must be
2358 * an \Array or \Hash containing the values to be substituted:
2360 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2361 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2362 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
/* String#%: formats arg using str as the format string.
 * arg is probed with rb_check_array_type(); an array is spread as the
 * argument list of rb_str_format(), anything else is passed as a single
 * argument.
 * NOTE(review): the condition selecting between the two returns is not
 * visible in this listing. */
2367 rb_str_format_m(VALUE str
, VALUE arg
)
2369 VALUE tmp
= rb_check_array_type(arg
);
2372 return rb_str_format(RARRAY_LENINT(tmp
), RARRAY_CONST_PTR(tmp
), str
);
2374 return rb_str_format(1, &arg
, str
);
/* Raises RuntimeError ("can't modify string; temporarily locked") when str
 * has the STR_TMPLOCK flag set; otherwise does nothing visible here. */
2378 rb_check_lockedtmp(VALUE str
)
2380 if (FL_TEST(str
, STR_TMPLOCK
)) {
2381 rb_raise(rb_eRuntimeError
, "can't modify string; temporarily locked");
/* Ensures str may be modified: checks the temporary lock first
 * (rb_check_lockedtmp) and then the frozen state (rb_check_frozen). Either
 * check raises on failure. */
2386 str_modifiable(VALUE str
)
2388 rb_check_lockedtmp(str
);
2389 rb_check_frozen(str
);
/* Tells whether str depends on a buffer it does not own. The visible test
 * singles out strings that are embedded or have neither STR_SHARED nor
 * STR_NOFREE set.
 * NOTE(review): the return values of both branches are not visible here;
 * presumably that case yields "not dependent" -- confirm. */
2393 str_dependent_p(VALUE str
)
2395 if (STR_EMBED_P(str
) || !FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
/* Checks that str is modifiable (raising otherwise) and returns non-zero
 * when it is not dependent on a foreign buffer, i.e.
 * !str_dependent_p(str). */
2404 str_independent(VALUE str
)
2406 str_modifiable(str
);
2407 return !str_dependent_p(str
);
/* Gives str its own buffer with capacity len + expand (plus termlen bytes
 * for the terminator), keeping the first len bytes.
 *
 * len is clamped to the new capacity. If str is a heap string whose new
 * contents fit into the embedded storage, the bytes are copied into the
 * embed array and str becomes embedded. Otherwise a new buffer of
 * capa + termlen bytes is allocated, the first len bytes are copied, and
 * str becomes a plain heap string: STR_NOEMBED set, STR_SHARED and
 * STR_NOFREE cleared, terminator written, ptr/len/capa updated.
 * NOTE(review): the lines that release the previous buffer (inside the
 * branch testing for an owned heap buffer) and the early return after the
 * embed path are not visible in this listing.
 */
2411 str_make_independent_expand(VALUE str
, long len
, long expand
, const int termlen
)
2415 long capa
= len
+ expand
;
2417 if (len
> capa
) len
= capa
;
2419 if (!STR_EMBED_P(str
) && str_embed_capa(str
) >= capa
+ termlen
) {
2420 ptr
= RSTRING(str
)->as
.heap
.ptr
;
2422 memcpy(RSTRING(str
)->as
.embed
.ary
, ptr
, len
);
2423 TERM_FILL(RSTRING(str
)->as
.embed
.ary
+ len
, termlen
);
2424 STR_SET_EMBED_LEN(str
, len
);
2428 ptr
= ALLOC_N(char, (size_t)capa
+ termlen
);
2429 oldptr
= RSTRING_PTR(str
);
2431 memcpy(ptr
, oldptr
, len
);
2433 if (FL_TEST_RAW(str
, STR_NOEMBED
|STR_NOFREE
|STR_SHARED
) == STR_NOEMBED
) {
2436 STR_SET_NOEMBED(str
);
2437 FL_UNSET(str
, STR_SHARED
|STR_NOFREE
);
2438 TERM_FILL(ptr
+ len
, termlen
);
2439 RSTRING(str
)->as
.heap
.ptr
= ptr
;
2440 RSTRING(str
)->as
.heap
.len
= len
;
2441 RSTRING(str
)->as
.heap
.aux
.capa
= capa
;
/* Prepares str for in-place modification: raises if it is locked or frozen
 * (via str_independent), gives it a private buffer when it depends on a
 * foreign one, and clears the cached code range. */
2445 rb_str_modify(VALUE str
)
2447 if (!str_independent(str
))
2448 str_make_independent(str
);
2449 ENC_CODERANGE_CLEAR(str
);
/* Like rb_str_modify(), but additionally reserves room for `expand` more
 * bytes.
 * Raises ArgumentError for a negative expand ("negative expanding string
 * size"; NOTE(review): the condition line is not visible here) and when
 * len + expand would reach LONG_MAX ("string size too big"). A dependent
 * string gets a private, enlarged buffer; an independent one is resized
 * with RESIZE_CAPA_TERM only when expand > 0. The code range is cleared. */
2453 rb_str_modify_expand(VALUE str
, long expand
)
2455 int termlen
= TERM_LEN(str
);
2456 long len
= RSTRING_LEN(str
);
2459 rb_raise(rb_eArgError
, "negative expanding string size");
2461 if (expand
>= LONG_MAX
- len
) {
2462 rb_raise(rb_eArgError
, "string size too big");
2465 if (!str_independent(str
)) {
2466 str_make_independent_expand(str
, len
, expand
, termlen
);
2468 else if (expand
> 0) {
2469 RESIZE_CAPA_TERM(str
, len
+ expand
, termlen
);
2471 ENC_CODERANGE_CLEAR(str
);
2474 /* As rb_str_modify(), but don't clear coderange */
/* Makes str independent for modification but keeps the cached code range,
 * except when it is ENC_CODERANGE_BROKEN: that value is cleared so the
 * string is re-scanned later. */
2476 str_modify_keep_cr(VALUE str
)
2478 if (!str_independent(str
))
2479 str_make_independent(str
);
2480 if (ENC_CODERANGE(str
) == ENC_CODERANGE_BROKEN
)
2481 /* Force re-scan later */
2482 ENC_CODERANGE_CLEAR(str
);
/* Drops the contents of str. After the modifiability check, a heap buffer
 * owned by str (not embedded, neither STR_SHARED nor STR_NOFREE) is freed
 * with ruby_sized_xfree() and ptr/len are zeroed so no dangling pointer
 * remains. Other kinds of strings are left untouched. */
2486 str_discard(VALUE str
)
2488 str_modifiable(str
);
2489 if (!STR_EMBED_P(str
) && !FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
2490 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
2491 RSTRING(str
)->as
.heap
.ptr
= 0;
2492 RSTRING(str
)->as
.heap
.len
= 0;
/* Raises Encoding::CompatibilityError (rb_eEncCompatError), naming the
 * encoding, when str's encoding is not ASCII-compatible. */
2497 rb_must_asciicompat(VALUE str
)
2499 rb_encoding
*enc
= rb_enc_get(str
);
2500 if (!rb_enc_asciicompat(enc
)) {
2501 rb_raise(rb_eEncCompatError
, "ASCII incompatible encoding: %s", rb_enc_name(enc
));
/* Ensures the VALUE behind ptr is a String: a non-T_STRING value is
 * converted with rb_str_to_str().
 * NOTE(review): the load from *ptr, the store back and the return are not
 * visible in this listing. */
2506 rb_string_value(volatile VALUE
*ptr
)
2509 if (!RB_TYPE_P(s
, T_STRING
)) {
2510 s
= rb_str_to_str(s
);
/* Coerces *ptr to a String with rb_string_value() and returns the pointer
 * to its bytes (RSTRING_PTR). */
2517 rb_string_value_ptr(volatile VALUE
*ptr
)
2519 VALUE str
= rb_string_value(ptr
);
2520 return RSTRING_PTR(str
);
/* Scans n bytes starting at s, counting n down to zero; used by callers to
 * test whether a terminator/character slot consists of zero bytes only.
 * NOTE(review): the loop body and the return values are not visible in
 * this listing. */
2524 zero_filled(const char *s
, int n
)
2526 for (; n
> 0; --n
) {
/* Searches [s, s + len) for a character made of minlen zero bytes, stepping
 * one character at a time with rb_enc_mbclen(). Returns the position of the
 * first such character.
 * NOTE(review): the not-found return is not visible in this listing. */
2533 str_null_char(const char *s
, long len
, const int minlen
, rb_encoding
*enc
)
2535 const char *e
= s
+ len
;
2537 for (; s
+ minlen
<= e
; s
+= rb_enc_mbclen(s
, e
, enc
)) {
2538 if (zero_filled(s
, minlen
)) return s
;
/* Makes sure the termlen bytes following s[len] are zero and returns the
 * (possibly relocated) buffer pointer of str.
 * For a dependent string the terminator cannot be written in place, so if
 * it is not already zero-filled the string is first made independent with
 * str_make_independent_expand(); otherwise TERM_FILL writes it directly.
 * NOTE(review): else-lines and an early return are not visible in this
 * listing. */
2544 str_fill_term(VALUE str
, char *s
, long len
, int termlen
)
2546 /* This function assumes that (capa + termlen) bytes of memory
2547 * is allocated, like many other functions in this file.
2549 if (str_dependent_p(str
)) {
2550 if (!zero_filled(s
+ len
, termlen
))
2551 str_make_independent_expand(str
, len
, 0L, termlen
);
2554 TERM_FILL(s
+ len
, termlen
);
2557 return RSTRING_PTR(str
);
/* Adjusts str after its terminator length changed from oldtermlen to
 * termlen.
 * capa here is the total buffer size including the old terminator.
 *  - If fewer than termlen bytes are free after the contents, the string is
 *    reallocated (after checking the temporary lock).
 *  - Else, a dependent string is made independent only when the terminator
 *    grows.
 *  - Else, for an owned heap string the stored capa is recomputed as
 *    capa - termlen instead of reallocating, and a longer terminator is
 *    written with TERM_FILL.
 * NOTE(review): else-lines and returns are missing from this listing, so
 * the exact nesting of the last two steps should be confirmed.
 */
2561 rb_str_change_terminator_length(VALUE str
, const int oldtermlen
, const int termlen
)
2563 long capa
= str_capacity(str
, oldtermlen
) + oldtermlen
;
2564 long len
= RSTRING_LEN(str
);
2566 assert(capa
>= len
);
2567 if (capa
- len
< termlen
) {
2568 rb_check_lockedtmp(str
);
2569 str_make_independent_expand(str
, len
, 0L, termlen
);
2571 else if (str_dependent_p(str
)) {
2572 if (termlen
> oldtermlen
)
2573 str_make_independent_expand(str
, len
, 0L, termlen
);
2576 if (!STR_EMBED_P(str
)) {
2577 /* modify capa instead of realloc */
2578 assert(!FL_TEST((str
), STR_SHARED
));
2579 RSTRING(str
)->as
.heap
.aux
.capa
= capa
- termlen
;
2581 if (termlen
> oldtermlen
) {
2582 TERM_FILL(RSTRING_PTR(str
) + len
, termlen
);
/* Checks str for embedded NUL characters and returns a properly terminated
 * buffer from str_fill_term().
 * Two detection paths are visible: a per-character scan with
 * str_null_char() and a byte scan with memchr() (also triggered when the
 * buffer pointer is NULL).
 * NOTE(review): the condition choosing between the two paths (presumably
 * minlen > 1), what is stored through w, and the returns taken when a NUL
 * is found are not visible in this listing. */
2590 str_null_check(VALUE str
, int *w
)
2592 char *s
= RSTRING_PTR(str
);
2593 long len
= RSTRING_LEN(str
);
2594 rb_encoding
*enc
= rb_enc_get(str
);
2595 const int minlen
= rb_enc_mbminlen(enc
);
2599 if (str_null_char(s
, len
, minlen
, enc
)) {
2602 return str_fill_term(str
, s
, len
, minlen
);
2605 if (!s
|| memchr(s
, 0, len
)) {
2609 s
= str_fill_term(str
, s
, len
, minlen
);
/* Returns the result of str_null_check(str, &w); the wide-char indicator w
 * is a local and is not reported to the caller. */
2615 rb_str_to_cstr(VALUE str
)
2618 return str_null_check(str
, &w
);
/* Coerces *ptr to a String and returns it as a C string after
 * str_null_check(). Raises ArgumentError with "string contains null char"
 * or "string contains null byte".
 * NOTE(review): the conditions selecting between the two messages
 * (presumably based on w) and the return are not visible in this
 * listing. */
2622 rb_string_value_cstr(volatile VALUE
*ptr
)
2624 VALUE str
= rb_string_value(ptr
);
2626 char *s
= str_null_check(str
, &w
);
2629 rb_raise(rb_eArgError
, "string contains null char");
2631 rb_raise(rb_eArgError
, "string contains null byte");
/* Ensures str is terminated by newminlen zero bytes and returns its buffer
 * pointer; delegates to str_fill_term() with str's current pointer and
 * length. */
2637 rb_str_fill_terminator(VALUE str
, const int newminlen
)
2639 char *s
= RSTRING_PTR(str
);
2640 long len
= RSTRING_LEN(str
);
2641 return str_fill_term(str
, s
, len
, newminlen
);
/* Attempts an implicit conversion of str to T_STRING via to_str
 * (rb_check_convert_type_with_id, the "check" variant of the conversion
 * used by rb_str_to_str()).
 * NOTE(review): the return statement is not visible in this listing. */
2645 rb_check_string_type(VALUE str
)
2647 str
= rb_check_convert_type_with_id(str
, T_STRING
, "String", idTo_str
);
2653 * String.try_convert(object) -> object, new_string, or nil
2655 * If +object+ is a \String object, returns +object+.
2657 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2658 * calls <tt>object.to_str</tt> and returns the result.
2660 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2662 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
/* String.try_convert: forwards str to rb_check_string_type(); the receiver
 * argument (dummy) is unused. */
2665 rb_str_s_try_convert(VALUE dummy
, VALUE str
)
2667 return rb_check_string_type(str
);
/* Advances p by *nthp characters within [p, e) for encoding enc.
 * Strategy by encoding: single-byte (mbmaxlen == 1), fixed-width
 * (mbmaxlen == mbminlen, pointer advanced by nth * width),
 * ASCII-compatible (skipping with search_nonascii() and stepping with
 * rb_enc_mbclen()), and a generic per-character loop.
 * NOTE(review): the local nth, the write-back to *nthp, bounds clamping and
 * the return are not visible in this listing. */
2671 str_nth_len(const char *p
, const char *e
, long *nthp
, rb_encoding
*enc
)
2674 if (rb_enc_mbmaxlen(enc
) == 1) {
2677 else if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
2678 p
+= nth
* rb_enc_mbmaxlen(enc
);
2680 else if (rb_enc_asciicompat(enc
)) {
2681 const char *p2
, *e2
;
2684 while (p
< e
&& 0 < nth
) {
2691 p2
= search_nonascii(p
, e2
);
2700 n
= rb_enc_mbclen(p
, e
, enc
);
2711 while (p
< e
&& nth
--) {
2712 p
+= rb_enc_mbclen(p
, e
, enc
);
/* Returns the pointer str_nth_len() yields for the nth character of
 * [p, e); the remaining-count value written back to the local nth is
 * discarded. */
2721 rb_enc_nth(const char *p
, const char *e
, long nth
, rb_encoding
*enc
)
2723 return str_nth_len(p
, e
, &nth
, enc
);
/* Locates the nth character of [p, e) via str_nth_len().
 * NOTE(review): the handling of the singlebyte flag and the checks applied
 * to the result before returning are not visible in this listing. */
2727 str_nth(const char *p
, const char *e
, long nth
, rb_encoding
*enc
, int singlebyte
)
2732 p
= str_nth_len(p
, e
, &nth
, enc
);
2739 /* char offset to byte offset */
/* Converts a character offset (nth) into a byte offset from p. When
 * str_nth() returns NULL the full byte length e - p is returned.
 * NOTE(review): the return for the found case is not visible here. */
2741 str_offset(const char *p
, const char *e
, long nth
, rb_encoding
*enc
, int singlebyte
)
2743 const char *pp
= str_nth(p
, e
, nth
, enc
, singlebyte
);
2744 if (!pp
) return e
- p
;
/* Byte offset of character position pos in str: calls str_offset() over
 * str's whole buffer with its encoding and single-byte-optimizable
 * flag. */
2749 rb_str_offset(VALUE str
, long pos
)
2751 return str_offset(RSTRING_PTR(str
), RSTRING_END(str
), pos
,
2752 STR_ENC_GET(str
), single_byte_optimizable(str
));
2755 #ifdef NONASCII_MASK
/* UTF-8 specialisation for locating the nth character in [p, e).
 * When both the byte range and nth exceed two machine words, lead bytes are
 * counted a word at a time (count_utf8_lead_bytes_with_word) over the
 * aligned middle part, after a byte-wise walk up to the first aligned
 * address; the word loop stops once fewer than SIZEOF_VOIDP characters
 * remain. The rest is walked byte by byte, stopping at the lead byte where
 * nth reaches zero.
 * NOTE(review): the local nth, pointer increments, the write-back to *nthp
 * and the return are not visible in this listing. */
2757 str_utf8_nth(const char *p
, const char *e
, long *nthp
)
2760 if ((int)SIZEOF_VOIDP
* 2 < e
- p
&& (int)SIZEOF_VOIDP
* 2 < nth
) {
2761 const uintptr_t *s
, *t
;
2762 const uintptr_t lowbits
= SIZEOF_VOIDP
- 1;
2763 s
= (const uintptr_t*)(~lowbits
& ((uintptr_t)p
+ lowbits
));
2764 t
= (const uintptr_t*)(~lowbits
& (uintptr_t)e
);
2765 while (p
< (const char *)s
) {
2766 if (is_utf8_lead_byte(*p
)) nth
--;
2770 nth
-= count_utf8_lead_bytes_with_word(s
);
2772 } while (s
< t
&& (int)SIZEOF_VOIDP
<= nth
);
2776 if (is_utf8_lead_byte(*p
)) {
2777 if (nth
== 0) break;
/* UTF-8 variant of the character-offset to byte-offset conversion, based on
 * str_utf8_nth().
 * NOTE(review): the return expression is not visible in this listing. */
2787 str_utf8_offset(const char *p
, const char *e
, long nth
)
2789 const char *pp
= str_utf8_nth(p
, e
, &nth
);
2794 /* byte offset to char offset */
/* Converts a byte offset pos into a character offset by counting the
 * characters in the first pos bytes with enc_strlen().
 * Single-byte-optimizable strings and negative positions take a shortcut.
 * NOTE(review): the shortcut's return value is not visible here
 * (presumably pos itself) -- confirm. */
2796 rb_str_sublen(VALUE str
, long pos
)
2798 if (single_byte_optimizable(str
) || pos
< 0)
2801 char *p
= RSTRING_PTR(str
);
2802 return enc_strlen(p
, p
+ pos
, STR_ENC_GET(str
), ENC_CODERANGE(str
));
/* Returns the byte sub-sequence of str starting at byte beg with len bytes.
 * When the result is too long to embed and SHARABLE_SUBSTRING_P allows it,
 * the new string shares the buffer of a frozen rb_cString version of str
 * (ptr advanced by beg, length capped at len); otherwise the bytes are
 * copied with rb_str_new(). Encoding and code range are then derived from
 * str with rb_enc_cr_str_copy_for_substr().
 * NOTE(review): declarations and the return are not visible in this
 * listing. */
2807 rb_str_subseq(VALUE str
, long beg
, long len
)
2811 if (!STR_EMBEDDABLE_P(len
, TERM_LEN(str
)) &&
2812 SHARABLE_SUBSTRING_P(beg
, len
, RSTRING_LEN(str
))) {
2814 str2
= rb_str_new_shared(rb_str_new_frozen_String(str
));
2815 RSTRING(str2
)->as
.heap
.ptr
+= beg
;
2816 olen
= RSTRING(str2
)->as
.heap
.len
;
2817 if (olen
> len
) RSTRING(str2
)->as
.heap
.len
= len
;
2820 str2
= rb_str_new(RSTRING_PTR(str
)+beg
, len
);
2824 rb_enc_cr_str_copy_for_substr(str2
, str
);
/* Translates a character range (beg, *lenp) of str into a byte pointer and
 * byte length. Returns 0 (NULL) when the range is invalid, e.g. a negative
 * length or a start beyond the end of the string.
 *
 * Strategies visible below, chosen by encoding and code range:
 *  - single-byte-optimizable strings: plain index arithmetic;
 *  - negative beg: either walk backwards from the end with
 *    rb_enc_prev_char() when the offset is small relative to the string
 *    (-beg * mbmaxlen < byte length / 8), or normalise beg using the
 *    character length from str_strlen();
 *  - valid UTF-8 (when NONASCII_MASK is defined): str_utf8_nth() /
 *    str_utf8_offset();
 *  - fixed-width encodings: multiply by the character size;
 *  - otherwise: str_nth_len() followed by str_offset().
 * NOTE(review): many lines (assignments to *lenp, clamping, the `end`
 * label, returns) are missing from this listing; this summary covers only
 * what is visible.
 */
2830 rb_str_subpos(VALUE str
, long beg
, long *lenp
)
2834 long blen
= RSTRING_LEN(str
);
2835 rb_encoding
*enc
= STR_ENC_GET(str
);
2836 char *p
, *s
= RSTRING_PTR(str
), *e
= s
+ blen
;
2838 if (len
< 0) return 0;
2842 if (single_byte_optimizable(str
)) {
2843 if (beg
> blen
) return 0;
2846 if (beg
< 0) return 0;
2848 if (len
> blen
- beg
)
2850 if (len
< 0) return 0;
2855 if (len
> -beg
) len
= -beg
;
2856 if (-beg
* rb_enc_mbmaxlen(enc
) < RSTRING_LEN(str
) / 8) {
2858 while (beg
-- > len
&& (e
= rb_enc_prev_char(s
, e
, e
, enc
)) != 0);
2861 while (len
-- > 0 && (p
= rb_enc_prev_char(s
, p
, e
, enc
)) != 0);
2867 slen
= str_strlen(str
, enc
);
2869 if (beg
< 0) return 0;
2871 if (len
== 0) goto end
;
2874 else if (beg
> 0 && beg
> RSTRING_LEN(str
)) {
2878 if (beg
> str_strlen(str
, enc
)) return 0; /* str's enc */
2881 #ifdef NONASCII_MASK
2882 else if (ENC_CODERANGE(str
) == ENC_CODERANGE_VALID
&&
2883 enc
== rb_utf8_encoding()) {
2884 p
= str_utf8_nth(s
, e
, &beg
);
2885 if (beg
> 0) return 0;
2886 len
= str_utf8_offset(p
, e
, len
);
2889 else if (rb_enc_mbmaxlen(enc
) == rb_enc_mbminlen(enc
)) {
2890 int char_sz
= rb_enc_mbmaxlen(enc
);
2892 p
= s
+ beg
* char_sz
;
2896 else if (len
* char_sz
> e
- p
)
2901 else if ((p
= str_nth_len(s
, e
, &beg
, enc
)) == e
) {
2902 if (beg
> 0) return 0;
2906 len
= str_offset(p
, e
, len
, enc
, 0);
/* Forward declaration; `empty` controls whether an empty result is returned
 * as a string or as nil (see the definition below). */
2914 static VALUE
str_substr(VALUE str
, long beg
, long len
, int empty
);
/* Character-based substring of str: str_substr() with empty == TRUE, so an
 * empty result is returned as a string rather than nil. */
2917 rb_str_substr(VALUE str
, long beg
, long len
)
2919 return str_substr(str
, beg
, len
, TRUE
);
/* Returns the substring of str described by the character range (beg, len),
 * or Qnil when rb_str_subpos() rejects the range.
 * Large results that qualify for sharing (not embeddable and
 * SHARABLE_SUBSTRING_P) reference the buffer of a frozen version of str at
 * the computed byte offset, with the code range cleared. Otherwise the
 * bytes are copied with rb_str_new(); in that path a zero-length result
 * yields Qnil when `empty` is false. Encoding/code range are then derived
 * from str.
 * NOTE(review): else-lines and the final return are not visible in this
 * listing. */
2923 str_substr(VALUE str
, long beg
, long len
, int empty
)
2926 char *p
= rb_str_subpos(str
, beg
, &len
);
2928 if (!p
) return Qnil
;
2929 if (!STR_EMBEDDABLE_P(len
, TERM_LEN(str
)) &&
2930 SHARABLE_SUBSTRING_P(p
, len
, RSTRING_END(str
))) {
2931 long ofs
= p
- RSTRING_PTR(str
);
2932 str2
= rb_str_new_frozen(str
);
2933 str2
= str_new_shared(rb_cString
, str2
);
2934 RSTRING(str2
)->as
.heap
.ptr
+= ofs
;
2935 RSTRING(str2
)->as
.heap
.len
= len
;
2936 ENC_CODERANGE_CLEAR(str2
);
2939 if (!len
&& !empty
) return Qnil
;
2940 str2
= rb_str_new(p
, len
);
2943 rb_enc_cr_str_copy_for_substr(str2
, str
);
2949 rb_str_freeze(VALUE str
)
2951 if (OBJ_FROZEN(str
)) return str
;
2952 rb_str_resize(str
, RSTRING_LEN(str
));
2953 return rb_obj_freeze(str
);
2959 * +string -> new_string or self
2961 * Returns +self+ if +self+ is not frozen.
2963 * Otherwise, returns <tt>self.dup</tt>, which is not frozen.
2966 str_uplus(VALUE str
)
2968 if (OBJ_FROZEN(str
)) {
2969 return rb_str_dup(str
);
2978 * -string -> frozen_string
2980 * Returns a frozen, possibly pre-existing copy of the string.
2982 * The returned \String will be deduplicated as long as it does not have
2983 * any instance variables set on it.
2986 str_uminus(VALUE str
)
2988 if (!BARE_STRING_P(str
) && !rb_obj_frozen_p(str
)) {
2989 str
= rb_str_dup(str
);
2991 return rb_fstring(str
);
2994 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str
), rb_str_new_frozen
, (str
))
2995 #define rb_str_dup_frozen rb_str_new_frozen
2998 rb_str_locktmp(VALUE str
)
3000 if (FL_TEST(str
, STR_TMPLOCK
)) {
3001 rb_raise(rb_eRuntimeError
, "temporal locking already locked string");
3003 FL_SET(str
, STR_TMPLOCK
);
3008 rb_str_unlocktmp(VALUE str
)
3010 if (!FL_TEST(str
, STR_TMPLOCK
)) {
3011 rb_raise(rb_eRuntimeError
, "temporal unlocking already unlocked string");
3013 FL_UNSET(str
, STR_TMPLOCK
);
3017 RUBY_FUNC_EXPORTED VALUE
3018 rb_str_locktmp_ensure(VALUE str
, VALUE (*func
)(VALUE
), VALUE arg
)
3020 rb_str_locktmp(str
);
3021 return rb_ensure(func
, arg
, rb_str_unlocktmp
, str
);
3025 rb_str_set_len(VALUE str
, long len
)
3028 const int termlen
= TERM_LEN(str
);
3030 str_modifiable(str
);
3031 if (STR_SHARED_P(str
)) {
3032 rb_raise(rb_eRuntimeError
, "can't set length of shared string");
3034 if (len
> (capa
= (long)str_capacity(str
, termlen
)) || len
< 0) {
3035 rb_bug("probable buffer overflow: %ld for %ld", len
, capa
);
3037 STR_SET_LEN(str
, len
);
3038 TERM_FILL(&RSTRING_PTR(str
)[len
], termlen
);
3042 rb_str_resize(VALUE str
, long len
)
3048 rb_raise(rb_eArgError
, "negative string size (or size too big)");
3051 independent
= str_independent(str
);
3052 ENC_CODERANGE_CLEAR(str
);
3053 slen
= RSTRING_LEN(str
);
3057 const int termlen
= TERM_LEN(str
);
3058 if (STR_EMBED_P(str
)) {
3059 if (len
== slen
) return str
;
3060 if (str_embed_capa(str
) >= len
+ termlen
) {
3061 STR_SET_EMBED_LEN(str
, len
);
3062 TERM_FILL(RSTRING(str
)->as
.embed
.ary
+ len
, termlen
);
3065 str_make_independent_expand(str
, slen
, len
- slen
, termlen
);
3067 else if (str_embed_capa(str
) >= len
+ termlen
) {
3068 char *ptr
= STR_HEAP_PTR(str
);
3070 if (slen
> len
) slen
= len
;
3071 if (slen
> 0) MEMCPY(RSTRING(str
)->as
.embed
.ary
, ptr
, char, slen
);
3072 TERM_FILL(RSTRING(str
)->as
.embed
.ary
+ len
, termlen
);
3073 STR_SET_EMBED_LEN(str
, len
);
3074 if (independent
) ruby_xfree(ptr
);
3077 else if (!independent
) {
3078 if (len
== slen
) return str
;
3079 str_make_independent_expand(str
, slen
, len
- slen
, termlen
);
3081 else if ((capa
= RSTRING(str
)->as
.heap
.aux
.capa
) < len
||
3082 (capa
- len
) > (len
< 1024 ? len
: 1024)) {
3083 SIZED_REALLOC_N(RSTRING(str
)->as
.heap
.ptr
, char,
3084 (size_t)len
+ termlen
, STR_HEAP_SIZE(str
));
3085 RSTRING(str
)->as
.heap
.aux
.capa
= len
;
3087 else if (len
== slen
) return str
;
3088 RSTRING(str
)->as
.heap
.len
= len
;
3089 TERM_FILL(RSTRING(str
)->as
.heap
.ptr
+ len
, termlen
); /* sentinel */
3095 str_buf_cat(VALUE str
, const char *ptr
, long len
)
3097 long capa
, total
, olen
, off
= -1;
3099 const int termlen
= TERM_LEN(str
);
3101 assert(termlen
< RSTRING_EMBED_LEN_MAX
+ 1); /* < (LONG_MAX/2) */
3104 RSTRING_GETMEM(str
, sptr
, olen
);
3105 if (ptr
>= sptr
&& ptr
<= sptr
+ olen
) {
3109 if (len
== 0) return 0;
3110 if (STR_EMBED_P(str
)) {
3111 capa
= str_embed_capa(str
) - termlen
;
3112 sptr
= RSTRING(str
)->as
.embed
.ary
;
3113 olen
= RSTRING_EMBED_LEN(str
);
3116 capa
= RSTRING(str
)->as
.heap
.aux
.capa
;
3117 sptr
= RSTRING(str
)->as
.heap
.ptr
;
3118 olen
= RSTRING(str
)->as
.heap
.len
;
3120 if (olen
> LONG_MAX
- len
) {
3121 rb_raise(rb_eArgError
, "string sizes too big");
3125 if (total
>= LONG_MAX
/ 2) {
3128 while (total
> capa
) {
3129 capa
= 2 * capa
+ termlen
; /* == 2*(capa+termlen)-termlen */
3131 RESIZE_CAPA_TERM(str
, capa
, termlen
);
3132 sptr
= RSTRING_PTR(str
);
3137 memcpy(sptr
+ olen
, ptr
, len
);
3138 STR_SET_LEN(str
, total
);
3139 TERM_FILL(sptr
+ total
, termlen
); /* sentinel */
3144 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
3147 rb_str_cat(VALUE str
, const char *ptr
, long len
)
3149 if (len
== 0) return str
;
3151 rb_raise(rb_eArgError
, "negative string size (or size too big)");
3153 return str_buf_cat(str
, ptr
, len
);
3157 rb_str_cat_cstr(VALUE str
, const char *ptr
)
3160 return rb_str_buf_cat(str
, ptr
, strlen(ptr
));
3163 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str
, const char *ptr
, long len
), rb_str_cat
, (str
, ptr
, len
))
3164 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str
, const char *ptr
), rb_str_cat_cstr
, (str
, ptr
))
3165 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str
, const char *ptr
), rb_str_cat_cstr
, (str
, ptr
))
3168 rb_enc_cr_str_buf_cat(VALUE str
, const char *ptr
, long len
,
3169 int ptr_encindex
, int ptr_cr
, int *ptr_cr_ret
)
3171 int str_encindex
= ENCODING_GET(str
);
3174 rb_encoding
*str_enc
, *ptr_enc
;
3176 str_cr
= RSTRING_LEN(str
) ? ENC_CODERANGE(str
) : ENC_CODERANGE_7BIT
;
3178 if (str_encindex
== ptr_encindex
) {
3179 if (str_cr
!= ENC_CODERANGE_UNKNOWN
&& ptr_cr
== ENC_CODERANGE_UNKNOWN
) {
3180 ptr_cr
= coderange_scan(ptr
, len
, rb_enc_from_index(ptr_encindex
));
3184 str_enc
= rb_enc_from_index(str_encindex
);
3185 ptr_enc
= rb_enc_from_index(ptr_encindex
);
3186 if (!rb_enc_asciicompat(str_enc
) || !rb_enc_asciicompat(ptr_enc
)) {
3189 if (RSTRING_LEN(str
) == 0) {
3190 rb_str_buf_cat(str
, ptr
, len
);
3191 ENCODING_CODERANGE_SET(str
, ptr_encindex
, ptr_cr
);
3192 rb_str_change_terminator_length(str
, rb_enc_mbminlen(str_enc
), rb_enc_mbminlen(ptr_enc
));
3197 if (ptr_cr
== ENC_CODERANGE_UNKNOWN
) {
3198 ptr_cr
= coderange_scan(ptr
, len
, ptr_enc
);
3200 if (str_cr
== ENC_CODERANGE_UNKNOWN
) {
3201 if (ENCODING_IS_ASCII8BIT(str
) || ptr_cr
!= ENC_CODERANGE_7BIT
) {
3202 str_cr
= rb_enc_str_coderange(str
);
3207 *ptr_cr_ret
= ptr_cr
;
3209 if (str_encindex
!= ptr_encindex
&&
3210 str_cr
!= ENC_CODERANGE_7BIT
&&
3211 ptr_cr
!= ENC_CODERANGE_7BIT
) {
3212 str_enc
= rb_enc_from_index(str_encindex
);
3213 ptr_enc
= rb_enc_from_index(ptr_encindex
);
3217 if (str_cr
== ENC_CODERANGE_UNKNOWN
) {
3218 res_encindex
= str_encindex
;
3219 res_cr
= ENC_CODERANGE_UNKNOWN
;
3221 else if (str_cr
== ENC_CODERANGE_7BIT
) {
3222 if (ptr_cr
== ENC_CODERANGE_7BIT
) {
3223 res_encindex
= str_encindex
;
3224 res_cr
= ENC_CODERANGE_7BIT
;
3227 res_encindex
= ptr_encindex
;
3231 else if (str_cr
== ENC_CODERANGE_VALID
) {
3232 res_encindex
= str_encindex
;
3233 if (ENC_CODERANGE_CLEAN_P(ptr_cr
))
3238 else { /* str_cr == ENC_CODERANGE_BROKEN */
3239 res_encindex
= str_encindex
;
3241 if (0 < len
) res_cr
= ENC_CODERANGE_UNKNOWN
;
3245 rb_raise(rb_eArgError
, "negative string size (or size too big)");
3247 str_buf_cat(str
, ptr
, len
);
3248 ENCODING_CODERANGE_SET(str
, res_encindex
, res_cr
);
3252 rb_raise(rb_eEncCompatError
, "incompatible character encodings: %s and %s",
3253 rb_enc_name(str_enc
), rb_enc_name(ptr_enc
));
3254 UNREACHABLE_RETURN(Qundef
);
3258 rb_enc_str_buf_cat(VALUE str
, const char *ptr
, long len
, rb_encoding
*ptr_enc
)
3260 return rb_enc_cr_str_buf_cat(str
, ptr
, len
,
3261 rb_enc_to_index(ptr_enc
), ENC_CODERANGE_UNKNOWN
, NULL
);
3265 rb_str_buf_cat_ascii(VALUE str
, const char *ptr
)
3267 /* ptr must reference NUL terminated ASCII string. */
3268 int encindex
= ENCODING_GET(str
);
3269 rb_encoding
*enc
= rb_enc_from_index(encindex
);
3270 if (rb_enc_asciicompat(enc
)) {
3271 return rb_enc_cr_str_buf_cat(str
, ptr
, strlen(ptr
),
3272 encindex
, ENC_CODERANGE_7BIT
, 0);
3275 char *buf
= ALLOCA_N(char, rb_enc_mbmaxlen(enc
));
3277 unsigned int c
= (unsigned char)*ptr
;
3278 int len
= rb_enc_codelen(c
, enc
);
3279 rb_enc_mbcput(c
, buf
, enc
);
3280 rb_enc_cr_str_buf_cat(str
, buf
, len
,
3281 encindex
, ENC_CODERANGE_VALID
, 0);
3289 rb_str_buf_append(VALUE str
, VALUE str2
)
3293 str2_cr
= ENC_CODERANGE(str2
);
3295 rb_enc_cr_str_buf_cat(str
, RSTRING_PTR(str2
), RSTRING_LEN(str2
),
3296 ENCODING_GET(str2
), str2_cr
, &str2_cr
);
3298 ENC_CODERANGE_SET(str2
, str2_cr
);
3304 rb_str_append(VALUE str
, VALUE str2
)
3307 return rb_str_buf_append(str
, str2
);
3310 #define MIN_PRE_ALLOC_SIZE 48
3312 MJIT_FUNC_EXPORTED VALUE
3313 rb_str_concat_literals(size_t num
, const VALUE
*strary
)
3319 if (UNLIKELY(!num
)) return rb_str_new(0, 0);
3320 if (UNLIKELY(num
== 1)) return rb_str_resurrect(strary
[0]);
3322 for (i
= 0; i
< num
; ++i
) { len
+= RSTRING_LEN(strary
[i
]); }
3323 if (LIKELY(len
< MIN_PRE_ALLOC_SIZE
)) {
3324 str
= rb_str_resurrect(strary
[0]);
3328 str
= rb_str_buf_new(len
);
3329 rb_enc_copy(str
, strary
[0]);
3333 for (i
= s
; i
< num
; ++i
) {
3334 const VALUE v
= strary
[i
];
3335 int encidx
= ENCODING_GET(v
);
3337 rb_enc_cr_str_buf_cat(str
, RSTRING_PTR(v
), RSTRING_LEN(v
),
3338 encidx
, ENC_CODERANGE(v
), NULL
);
3339 if (encidx
!= ENCINDEX_US_ASCII
) {
3340 if (ENCODING_GET_INLINED(str
) == ENCINDEX_US_ASCII
)
3341 rb_enc_set_index(str
, encidx
);
3349 * concat(*objects) -> string
3351 * Concatenates each object in +objects+ to +self+ and returns +self+:
3354 * s.concat('bar', 'baz') # => "foobarbaz"
3355 * s # => "foobarbaz"
3357 * For each given object +object+ that is an \Integer,
3358 * the value is considered a codepoint and converted to a character before concatenation:
3361 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3363 * Related: String#<<, which takes a single argument.
3366 rb_str_concat_multi(int argc
, VALUE
*argv
, VALUE str
)
3368 str_modifiable(str
);
3371 return rb_str_concat(str
, argv
[0]);
3373 else if (argc
> 1) {
3375 VALUE arg_str
= rb_str_tmp_new(0);
3376 rb_enc_copy(arg_str
, str
);
3377 for (i
= 0; i
< argc
; i
++) {
3378 rb_str_concat(arg_str
, argv
[i
]);
3380 rb_str_buf_append(str
, arg_str
);
3388 * string << object -> string
3390 * Concatenates +object+ to +self+ and returns +self+:
3393 * s << 'bar' # => "foobar"
3396 * If +object+ is an \Integer,
3397 * the value is considered a codepoint and converted to a character before concatenation:
3400 * s << 33 # => "foo!"
3402 * Related: String#concat, which takes multiple arguments.
3405 rb_str_concat(VALUE str1
, VALUE str2
)
3408 rb_encoding
*enc
= STR_ENC_GET(str1
);
3411 if (RB_INTEGER_TYPE_P(str2
)) {
3412 if (rb_num_to_uint(str2
, &code
) == 0) {
3414 else if (FIXNUM_P(str2
)) {
3415 rb_raise(rb_eRangeError
, "%ld out of char range", FIX2LONG(str2
));
3418 rb_raise(rb_eRangeError
, "bignum out of char range");
3422 return rb_str_append(str1
, str2
);
3425 encidx
= rb_enc_to_index(enc
);
3426 if (encidx
== ENCINDEX_ASCII
|| encidx
== ENCINDEX_US_ASCII
) {
3427 /* US-ASCII automatically extended to ASCII-8BIT */
3429 buf
[0] = (char)code
;
3431 rb_raise(rb_eRangeError
, "%u out of char range", code
);
3433 rb_str_cat(str1
, buf
, 1);
3434 if (encidx
== ENCINDEX_US_ASCII
&& code
> 127) {
3435 rb_enc_associate_index(str1
, ENCINDEX_ASCII
);
3436 ENC_CODERANGE_SET(str1
, ENC_CODERANGE_VALID
);
3440 long pos
= RSTRING_LEN(str1
);
3441 int cr
= ENC_CODERANGE(str1
);
3445 switch (len
= rb_enc_codelen(code
, enc
)) {
3446 case ONIGERR_INVALID_CODE_POINT_VALUE
:
3447 rb_raise(rb_eRangeError
, "invalid codepoint 0x%X in %s", code
, rb_enc_name(enc
));
3449 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
:
3451 rb_raise(rb_eRangeError
, "%u out of char range", code
);
3454 buf
= ALLOCA_N(char, len
+ 1);
3455 rb_enc_mbcput(code
, buf
, enc
);
3456 if (rb_enc_precise_mbclen(buf
, buf
+ len
+ 1, enc
) != len
) {
3457 rb_raise(rb_eRangeError
, "invalid codepoint 0x%X in %s", code
, rb_enc_name(enc
));
3459 rb_str_resize(str1
, pos
+len
);
3460 memcpy(RSTRING_PTR(str1
) + pos
, buf
, len
);
3461 if (cr
== ENC_CODERANGE_7BIT
&& code
> 127)
3462 cr
= ENC_CODERANGE_VALID
;
3463 ENC_CODERANGE_SET(str1
, cr
);
3470 * prepend(*other_strings) -> string
3472 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3475 * s.prepend('bar', 'baz') # => "barbazfoo"
3476 * s # => "barbazfoo"
3478 * Related: String#concat.
3482 rb_str_prepend_multi(int argc
, VALUE
*argv
, VALUE str
)
3484 str_modifiable(str
);
3487 rb_str_update(str
, 0L, 0L, argv
[0]);
3489 else if (argc
> 1) {
3491 VALUE arg_str
= rb_str_tmp_new(0);
3492 rb_enc_copy(arg_str
, str
);
3493 for (i
= 0; i
< argc
; i
++) {
3494 rb_str_append(arg_str
, argv
[i
]);
3496 rb_str_update(str
, 0L, 0L, arg_str
);
3503 rb_str_hash(VALUE str
)
3505 int e
= ENCODING_GET(str
);
3506 if (e
&& rb_enc_str_coderange(str
) == ENC_CODERANGE_7BIT
) {
3509 return rb_memhash((const void *)RSTRING_PTR(str
), RSTRING_LEN(str
)) ^ e
;
3513 rb_str_hash_cmp(VALUE str1
, VALUE str2
)
3516 const char *ptr1
, *ptr2
;
3517 RSTRING_GETMEM(str1
, ptr1
, len1
);
3518 RSTRING_GETMEM(str2
, ptr2
, len2
);
3519 return (len1
!= len2
||
3520 !rb_str_comparable(str1
, str2
) ||
3521 memcmp(ptr1
, ptr2
, len1
) != 0);
3528 * Returns the integer hash value for +self+.
3529 * The value is based on the length, content and encoding of +self+.
3531 * Related: Object#hash.
3535 rb_str_hash_m(VALUE str
)
3537 st_index_t hval
= rb_str_hash(str
);
3538 return ST2FIX(hval
);
3541 #define lesser(a,b) (((a)>(b))?(b):(a))
3544 rb_str_comparable(VALUE str1
, VALUE str2
)
3549 if (RSTRING_LEN(str1
) == 0) return TRUE
;
3550 if (RSTRING_LEN(str2
) == 0) return TRUE
;
3551 idx1
= ENCODING_GET(str1
);
3552 idx2
= ENCODING_GET(str2
);
3553 if (idx1
== idx2
) return TRUE
;
3554 rc1
= rb_enc_str_coderange(str1
);
3555 rc2
= rb_enc_str_coderange(str2
);
3556 if (rc1
== ENC_CODERANGE_7BIT
) {
3557 if (rc2
== ENC_CODERANGE_7BIT
) return TRUE
;
3558 if (rb_enc_asciicompat(rb_enc_from_index(idx2
)))
3561 if (rc2
== ENC_CODERANGE_7BIT
) {
3562 if (rb_enc_asciicompat(rb_enc_from_index(idx1
)))
3569 rb_str_cmp(VALUE str1
, VALUE str2
)
3572 const char *ptr1
, *ptr2
;
3575 if (str1
== str2
) return 0;
3576 RSTRING_GETMEM(str1
, ptr1
, len1
);
3577 RSTRING_GETMEM(str2
, ptr2
, len2
);
3578 if (ptr1
== ptr2
|| (retval
= memcmp(ptr1
, ptr2
, lesser(len1
, len2
))) == 0) {
3580 if (!rb_str_comparable(str1
, str2
)) {
3581 if (ENCODING_GET(str1
) > ENCODING_GET(str2
))
3587 if (len1
> len2
) return 1;
3590 if (retval
> 0) return 1;
3596 * string == object -> true or false
3597 * string === object -> true or false
3599 * Returns +true+ if +object+ has the same length and content
3600 * as +self+; +false+ otherwise:
3603 * s == 'foo' # => true
3604 * s == 'food' # => false
3605 * s == 'FOO' # => false
3607 * Returns +false+ if the two strings' encodings are not compatible:
3608 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3610 * If +object+ is not an instance of \String but responds to +to_str+, then the
3611 * two strings are compared using <code>object.==</code>.
3615 rb_str_equal(VALUE str1
, VALUE str2
)
3617 if (str1
== str2
) return Qtrue
;
3618 if (!RB_TYPE_P(str2
, T_STRING
)) {
3619 if (!rb_respond_to(str2
, idTo_str
)) {
3622 return rb_equal(str2
, str1
);
3624 return rb_str_eql_internal(str1
, str2
);
3629 * eql?(object) -> true or false
3631 * Returns +true+ if +object+ has the same length and content
3632 * as +self+; +false+ otherwise:
3635 * s.eql?('foo') # => true
3636 * s.eql?('food') # => false
3637 * s.eql?('FOO') # => false
3639 * Returns +false+ if the two strings' encodings are not compatible:
3641 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3645 MJIT_FUNC_EXPORTED VALUE
3646 rb_str_eql(VALUE str1
, VALUE str2
)
3648 if (str1
== str2
) return Qtrue
;
3649 if (!RB_TYPE_P(str2
, T_STRING
)) return Qfalse
;
3650 return rb_str_eql_internal(str1
, str2
);
3655 * string <=> other_string -> -1, 0, 1, or nil
3657 * Compares +self+ and +other_string+, returning:
3659 * - -1 if +other_string+ is larger.
3660 * - 0 if the two are equal.
3661 * - 1 if +other_string+ is smaller.
3662 * - +nil+ if the two are incomparable.
3666 * 'foo' <=> 'foo' # => 0
3667 * 'foo' <=> 'food' # => -1
3668 * 'food' <=> 'foo' # => 1
3669 * 'FOO' <=> 'foo' # => -1
3670 * 'foo' <=> 'FOO' # => 1
3671 * 'foo' <=> 1 # => nil
3676 rb_str_cmp_m(VALUE str1
, VALUE str2
)
3679 VALUE s
= rb_check_string_type(str2
);
3681 return rb_invcmp(str1
, str2
);
3683 result
= rb_str_cmp(str1
, s
);
3684 return INT2FIX(result
);
3687 static VALUE
str_casecmp(VALUE str1
, VALUE str2
);
3688 static VALUE
str_casecmp_p(VALUE str1
, VALUE str2
);
3692 * casecmp(other_string) -> -1, 0, 1, or nil
3694 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3696 * - -1 if <tt>other_string.downcase</tt> is larger.
3697 * - 0 if the two are equal.
3698 * - 1 if <tt>other_string.downcase</tt> is smaller.
3699 * - +nil+ if the two are incomparable.
3703 * 'foo'.casecmp('foo') # => 0
3704 * 'foo'.casecmp('food') # => -1
3705 * 'food'.casecmp('foo') # => 1
3706 * 'FOO'.casecmp('foo') # => 0
3707 * 'foo'.casecmp('FOO') # => 0
3708 * 'foo'.casecmp(1) # => nil
3710 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3712 * Related: String#casecmp?.
3717 rb_str_casecmp(VALUE str1
, VALUE str2
)
3719 VALUE s
= rb_check_string_type(str2
);
3723 return str_casecmp(str1
, s
);
3727 str_casecmp(VALUE str1
, VALUE str2
)
3731 const char *p1
, *p1end
, *p2
, *p2end
;
3733 enc
= rb_enc_compatible(str1
, str2
);
3738 p1
= RSTRING_PTR(str1
); p1end
= RSTRING_END(str1
);
3739 p2
= RSTRING_PTR(str2
); p2end
= RSTRING_END(str2
);
3740 if (single_byte_optimizable(str1
) && single_byte_optimizable(str2
)) {
3741 while (p1
< p1end
&& p2
< p2end
) {
3743 unsigned int c1
= TOLOWER(*p1
& 0xff);
3744 unsigned int c2
= TOLOWER(*p2
& 0xff);
3746 return INT2FIX(c1
< c2
? -1 : 1);
3753 while (p1
< p1end
&& p2
< p2end
) {
3754 int l1
, c1
= rb_enc_ascget(p1
, p1end
, &l1
, enc
);
3755 int l2
, c2
= rb_enc_ascget(p2
, p2end
, &l2
, enc
);
3757 if (0 <= c1
&& 0 <= c2
) {
3761 return INT2FIX(c1
< c2
? -1 : 1);
3765 l1
= rb_enc_mbclen(p1
, p1end
, enc
);
3766 l2
= rb_enc_mbclen(p2
, p2end
, enc
);
3767 len
= l1
< l2
? l1
: l2
;
3768 r
= memcmp(p1
, p2
, len
);
3770 return INT2FIX(r
< 0 ? -1 : 1);
3772 return INT2FIX(l1
< l2
? -1 : 1);
3778 if (RSTRING_LEN(str1
) == RSTRING_LEN(str2
)) return INT2FIX(0);
3779 if (RSTRING_LEN(str1
) > RSTRING_LEN(str2
)) return INT2FIX(1);
3785 * casecmp?(other_string) -> true, false, or nil
3787 * Returns +true+ if +self+ and +other_string+ are equal after
3788 * Unicode case folding, otherwise +false+:
3790 * 'foo'.casecmp?('foo') # => true
3791 * 'foo'.casecmp?('food') # => false
3792 * 'food'.casecmp?('foo') # => false
3793 * 'FOO'.casecmp?('foo') # => true
3794 * 'foo'.casecmp?('FOO') # => true
3796 * Returns +nil+ if the two values are incomparable:
3798 * 'foo'.casecmp?(1) # => nil
3800 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3802 * Related: String#casecmp.
3807 rb_str_casecmp_p(VALUE str1
, VALUE str2
)
3809 VALUE s
= rb_check_string_type(str2
);
3813 return str_casecmp_p(str1
, s
);
3817 str_casecmp_p(VALUE str1
, VALUE str2
)
3820 VALUE folded_str1
, folded_str2
;
3821 VALUE fold_opt
= sym_fold
;
3823 enc
= rb_enc_compatible(str1
, str2
);
3828 folded_str1
= rb_str_downcase(1, &fold_opt
, str1
);
3829 folded_str2
= rb_str_downcase(1, &fold_opt
, str2
);
3831 return rb_str_eql(folded_str1
, folded_str2
);
3835 strseq_core(const char *str_ptr
, const char *str_ptr_end
, long str_len
,
3836 const char *sub_ptr
, long sub_len
, long offset
, rb_encoding
*enc
)
3838 const char *search_start
= str_ptr
;
3839 long pos
, search_len
= str_len
- offset
;
3843 pos
= rb_memsearch(sub_ptr
, sub_len
, search_start
, search_len
, enc
);
3844 if (pos
< 0) return pos
;
3845 t
= rb_enc_right_char_head(search_start
, search_start
+pos
, str_ptr_end
, enc
);
3846 if (t
== search_start
+ pos
) break;
3847 search_len
-= t
- search_start
;
3848 if (search_len
<= 0) return -1;
3849 offset
+= t
- search_start
;
3852 return pos
+ offset
;
3855 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3858 rb_strseq_index(VALUE str
, VALUE sub
, long offset
, int in_byte
)
3860 const char *str_ptr
, *str_ptr_end
, *sub_ptr
;
3861 long str_len
, sub_len
;
3864 enc
= rb_enc_check(str
, sub
);
3865 if (is_broken_string(sub
)) return -1;
3867 str_ptr
= RSTRING_PTR(str
);
3868 str_ptr_end
= RSTRING_END(str
);
3869 str_len
= RSTRING_LEN(str
);
3870 sub_ptr
= RSTRING_PTR(sub
);
3871 sub_len
= RSTRING_LEN(sub
);
3873 if (str_len
< sub_len
) return -1;
3876 long str_len_char
, sub_len_char
;
3877 int single_byte
= single_byte_optimizable(str
);
3878 str_len_char
= (in_byte
|| single_byte
) ? str_len
: str_strlen(str
, enc
);
3879 sub_len_char
= in_byte
? sub_len
: str_strlen(sub
, enc
);
3881 offset
+= str_len_char
;
3882 if (offset
< 0) return -1;
3884 if (str_len_char
- offset
< sub_len_char
) return -1;
3885 if (!in_byte
) offset
= str_offset(str_ptr
, str_ptr_end
, offset
, enc
, single_byte
);
3888 if (sub_len
== 0) return offset
;
3890 /* need proceed one character at a time */
3891 return strseq_core(str_ptr
, str_ptr_end
, str_len
, sub_ptr
, sub_len
, offset
, enc
);
3897 * index(substring, offset = 0) -> integer or nil
3898 * index(regexp, offset = 0) -> integer or nil
3900 * Returns the \Integer index of the first occurrence of the given +substring+,
3901 * or +nil+ if none found:
3903 * 'foo'.index('f') # => 0
3904 * 'foo'.index('o') # => 1
3905 * 'foo'.index('oo') # => 1
3906 * 'foo'.index('ooo') # => nil
3908 * Returns the \Integer index of the first match for the given \Regexp +regexp+,
3909 * or +nil+ if none found:
3911 * 'foo'.index(/f/) # => 0
3912 * 'foo'.index(/o/) # => 1
3913 * 'foo'.index(/oo/) # => 1
3914 * 'foo'.index(/ooo/) # => nil
3916 * \Integer argument +offset+, if given, specifies the position in the
3917 * string to begin the search:
3919 * 'foo'.index('o', 1) # => 1
3920 * 'foo'.index('o', 2) # => 2
3921 * 'foo'.index('o', 3) # => nil
3923 * If +offset+ is negative, counts backward from the end of +self+:
3925 * 'foo'.index('o', -1) # => 2
3926 * 'foo'.index('o', -2) # => 1
3927 * 'foo'.index('o', -3) # => 1
3928 * 'foo'.index('o', -4) # => nil
3930 * Related: String#rindex.
3934 rb_str_index_m(int argc
, VALUE
*argv
, VALUE str
)
3940 if (rb_scan_args(argc
, argv
, "11", &sub
, &initpos
) == 2) {
3941 pos
= NUM2LONG(initpos
);
3947 pos
+= str_strlen(str
, NULL
);
3949 if (RB_TYPE_P(sub
, T_REGEXP
)) {
3950 rb_backref_set(Qnil
);
3956 if (RB_TYPE_P(sub
, T_REGEXP
)) {
3957 if (pos
> str_strlen(str
, NULL
))
3959 pos
= str_offset(RSTRING_PTR(str
), RSTRING_END(str
), pos
,
3960 rb_enc_check(str
, sub
), single_byte_optimizable(str
));
3962 if (rb_reg_search(sub
, str
, pos
, 0) < 0) {
3966 VALUE match
= rb_backref_get();
3967 struct re_registers
*regs
= RMATCH_REGS(match
);
3968 pos
= rb_str_sublen(str
, BEG(0));
3969 return LONG2NUM(pos
);
3974 pos
= rb_str_index(str
, sub
, pos
);
3975 pos
= rb_str_sublen(str
, pos
);
3978 if (pos
== -1) return Qnil
;
3979 return LONG2NUM(pos
);
3984 str_rindex(VALUE str
, VALUE sub
, const char *s
, long pos
, rb_encoding
*enc
)
3986 char *hit
, *adjusted
;
3988 long slen
, searchlen
;
3991 slen
= RSTRING_LEN(sub
);
3992 if (slen
== 0) return pos
;
3993 sbeg
= RSTRING_PTR(str
);
3994 e
= RSTRING_END(str
);
3995 t
= RSTRING_PTR(sub
);
3997 searchlen
= s
- sbeg
+ 1;
4000 hit
= memrchr(sbeg
, c
, searchlen
);
4002 adjusted
= rb_enc_left_char_head(sbeg
, hit
, e
, enc
);
4003 if (hit
!= adjusted
) {
4004 searchlen
= adjusted
- sbeg
;
4007 if (memcmp(hit
, t
, slen
) == 0)
4008 return rb_str_sublen(str
, hit
- sbeg
);
4009 searchlen
= adjusted
- sbeg
;
4010 } while (searchlen
> 0);
4016 str_rindex(VALUE str
, VALUE sub
, const char *s
, long pos
, rb_encoding
*enc
)
4021 sbeg
= RSTRING_PTR(str
);
4022 e
= RSTRING_END(str
);
4023 t
= RSTRING_PTR(sub
);
4024 slen
= RSTRING_LEN(sub
);
4027 if (memcmp(s
, t
, slen
) == 0) {
4030 if (pos
== 0) break;
4032 s
= rb_enc_prev_char(sbeg
, s
, e
, enc
);
4040 rb_str_rindex(VALUE str
, VALUE sub
, long pos
)
4047 enc
= rb_enc_check(str
, sub
);
4048 if (is_broken_string(sub
)) return -1;
4049 singlebyte
= single_byte_optimizable(str
);
4050 len
= singlebyte
? RSTRING_LEN(str
) : str_strlen(str
, enc
); /* rb_enc_check */
4051 slen
= str_strlen(sub
, enc
); /* rb_enc_check */
4053 /* substring longer than string */
4054 if (len
< slen
) return -1;
4055 if (len
- pos
< slen
) pos
= len
- slen
;
4056 if (len
== 0) return pos
;
4058 sbeg
= RSTRING_PTR(str
);
4061 if (memcmp(sbeg
, RSTRING_PTR(sub
), RSTRING_LEN(sub
)) == 0)
4067 s
= str_nth(sbeg
, RSTRING_END(str
), pos
, enc
, singlebyte
);
4068 return str_rindex(str
, sub
, s
, pos
, enc
);
4073 * rindex(substring, offset = self.length) -> integer or nil
4074 * rindex(regexp, offset = self.length) -> integer or nil
4076 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4077 * or +nil+ if none found:
4079 * 'foo'.rindex('f') # => 0
4080 * 'foo'.rindex('o') # => 2
4081 * 'foo'.rindex('oo') # => 1
4082 * 'foo'.rindex('ooo') # => nil
4084 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4085 * or +nil+ if none found:
4087 * 'foo'.rindex(/f/) # => 0
4088 * 'foo'.rindex(/o/) # => 2
4089 * 'foo'.rindex(/oo/) # => 1
4090 * 'foo'.rindex(/ooo/) # => nil
4092 * The _last_ match means starting at the possible last position, not
4093 * the last of longest matches.
4095 * 'foo'.rindex(/o+/) # => 2
4096 * $~ #=> #<MatchData "o">
4098 * To get the last longest match, needs to combine with negative
4101 * 'foo'.rindex(/(?<!o)o+/) # => 1
4102 * $~ #=> #<MatchData "oo">
4104 * Or String#index with negative lookahead.
4106 * 'foo'.index(/o+(?!.*o)/) # => 1
4107 * $~ #=> #<MatchData "oo">
4109 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4110 * string to _end_ the search:
4112 * 'foo'.rindex('o', 0) # => nil
4113 * 'foo'.rindex('o', 1) # => 1
4114 * 'foo'.rindex('o', 2) # => 2
4115 * 'foo'.rindex('o', 3) # => 2
4117 * If +offset+ is a negative \Integer, the maximum starting position in the
4118 * string to _end_ the search is the sum of the string's length and +offset+:
4120 * 'foo'.rindex('o', -1) # => 2
4121 * 'foo'.rindex('o', -2) # => 1
4122 * 'foo'.rindex('o', -3) # => nil
4123 * 'foo'.rindex('o', -4) # => nil
4125 * Related: String#index.
4129 rb_str_rindex_m(int argc
, VALUE
*argv
, VALUE str
)
4133 rb_encoding
*enc
= STR_ENC_GET(str
);
4134 long pos
, len
= str_strlen(str
, enc
); /* str's enc */
4136 if (rb_scan_args(argc
, argv
, "11", &sub
, &vpos
) == 2) {
4137 pos
= NUM2LONG(vpos
);
4141 if (RB_TYPE_P(sub
, T_REGEXP
)) {
4142 rb_backref_set(Qnil
);
4147 if (pos
> len
) pos
= len
;
4153 if (RB_TYPE_P(sub
, T_REGEXP
)) {
4154 /* enc = rb_get_check(str, sub); */
4155 pos
= str_offset(RSTRING_PTR(str
), RSTRING_END(str
), pos
,
4156 enc
, single_byte_optimizable(str
));
4158 if (rb_reg_search(sub
, str
, pos
, 1) >= 0) {
4159 VALUE match
= rb_backref_get();
4160 struct re_registers
*regs
= RMATCH_REGS(match
);
4161 pos
= rb_str_sublen(str
, BEG(0));
4162 return LONG2NUM(pos
);
4167 pos
= rb_str_rindex(str
, sub
, pos
);
4168 if (pos
>= 0) return LONG2NUM(pos
);
4175 * string =~ regexp -> integer or nil
4176 * string =~ object -> integer or nil
4178 * Returns the \Integer index of the first substring that matches
4179 * the given +regexp+, or +nil+ if no match found:
4181 * 'foo' =~ /f/ # => 0
4182 * 'foo' =~ /o/ # => 1
4183 * 'foo' =~ /x/ # => nil
4185 * Note: also updates
4186 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4188 * If the given +object+ is not a \Regexp, returns the value
4189 * returned by <tt>object =~ self</tt>.
4191 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4192 * (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
4195 * "no. 9" =~ /(?<number>\d+)/
4196 * number # => nil (not assigned)
4197 * /(?<number>\d+)/ =~ "no. 9"
4203 rb_str_match(VALUE x
, VALUE y
)
4205 switch (OBJ_BUILTIN_TYPE(y
)) {
4207 rb_raise(rb_eTypeError
, "type mismatch: String given");
4210 return rb_reg_match(y
, x
);
4213 return rb_funcall(y
, idEqTilde
, 1, x
);
4218 static VALUE
get_pat(VALUE
);
4223 * match(pattern, offset = 0) -> matchdata or nil
4224 * match(pattern, offset = 0) {|matchdata| ... } -> object
4226 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4228 * Note: also updates
4229 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4231 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4232 * regexp = Regexp.new(pattern)
4233 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4234 * (see Regexp#match):
4235 * matchdata = regexp.match(self)
4237 * With no block given, returns the computed +matchdata+:
4239 * 'foo'.match('f') # => #<MatchData "f">
4240 * 'foo'.match('o') # => #<MatchData "o">
4241 * 'foo'.match('x') # => nil
4243 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4245 * 'foo'.match('f', 1) # => nil
4246 * 'foo'.match('o', 1) # => #<MatchData "o">
4248 * With a block given, calls the block with the computed +matchdata+
4249 * and returns the block's return value:
4251 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4252 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4253 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4258 rb_str_match_m(int argc
, VALUE
*argv
, VALUE str
)
4262 rb_check_arity(argc
, 1, 2);
4265 result
= rb_funcallv(get_pat(re
), rb_intern("match"), argc
, argv
);
4266 if (!NIL_P(result
) && rb_block_given_p()) {
4267 return rb_yield(result
);
4274 * match?(pattern, offset = 0) -> true or false
4276 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4278 * Note: does not update
4279 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4281 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4282 * regexp = Regexp.new(pattern)
4284 * Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4285 * +false+ otherwise:
4287 * 'foo'.match?(/o/) # => true
4288 * 'foo'.match?('o') # => true
4289 * 'foo'.match?(/x/) # => false
4291 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4292 * 'foo'.match?('f', 1) # => false
4293 * 'foo'.match?('o', 1) # => true
4298 rb_str_match_m_p(int argc
, VALUE
*argv
, VALUE str
)
4301 rb_check_arity(argc
, 1, 2);
4302 re
= get_pat(argv
[0]);
4303 return rb_reg_match_p(re
, str
, argc
> 1 ? NUM2LONG(argv
[1]) : 0);
4306 enum neighbor_char
{
4312 static enum neighbor_char
4313 enc_succ_char(char *p
, long len
, rb_encoding
*enc
)
4318 if (rb_enc_mbminlen(enc
) > 1) {
4319 /* wchar, trivial case */
4320 int r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
), c
;
4321 if (!MBCLEN_CHARFOUND_P(r
)) {
4322 return NEIGHBOR_NOT_CHAR
;
4324 c
= rb_enc_mbc_to_codepoint(p
, p
+ len
, enc
) + 1;
4325 l
= rb_enc_code_to_mbclen(c
, enc
);
4326 if (!l
) return NEIGHBOR_NOT_CHAR
;
4327 if (l
!= len
) return NEIGHBOR_WRAPPED
;
4328 rb_enc_mbcput(c
, p
, enc
);
4329 r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
);
4330 if (!MBCLEN_CHARFOUND_P(r
)) {
4331 return NEIGHBOR_NOT_CHAR
;
4333 return NEIGHBOR_FOUND
;
4336 for (i
= len
-1; 0 <= i
&& (unsigned char)p
[i
] == 0xff; i
--)
4339 return NEIGHBOR_WRAPPED
;
4340 ++((unsigned char*)p
)[i
];
4341 l
= rb_enc_precise_mbclen(p
, p
+len
, enc
);
4342 if (MBCLEN_CHARFOUND_P(l
)) {
4343 l
= MBCLEN_CHARFOUND_LEN(l
);
4345 return NEIGHBOR_FOUND
;
4348 memset(p
+l
, 0xff, len
-l
);
4351 if (MBCLEN_INVALID_P(l
) && i
< len
-1) {
4354 for (len2
= len
-1; 0 < len2
; len2
--) {
4355 l2
= rb_enc_precise_mbclen(p
, p
+len2
, enc
);
4356 if (!MBCLEN_INVALID_P(l2
))
4359 memset(p
+len2
+1, 0xff, len
-(len2
+1));
4364 static enum neighbor_char
4365 enc_pred_char(char *p
, long len
, rb_encoding
*enc
)
4369 if (rb_enc_mbminlen(enc
) > 1) {
4370 /* wchar, trivial case */
4371 int r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
), c
;
4372 if (!MBCLEN_CHARFOUND_P(r
)) {
4373 return NEIGHBOR_NOT_CHAR
;
4375 c
= rb_enc_mbc_to_codepoint(p
, p
+ len
, enc
);
4376 if (!c
) return NEIGHBOR_NOT_CHAR
;
4378 l
= rb_enc_code_to_mbclen(c
, enc
);
4379 if (!l
) return NEIGHBOR_NOT_CHAR
;
4380 if (l
!= len
) return NEIGHBOR_WRAPPED
;
4381 rb_enc_mbcput(c
, p
, enc
);
4382 r
= rb_enc_precise_mbclen(p
, p
+ len
, enc
);
4383 if (!MBCLEN_CHARFOUND_P(r
)) {
4384 return NEIGHBOR_NOT_CHAR
;
4386 return NEIGHBOR_FOUND
;
4389 for (i
= len
-1; 0 <= i
&& (unsigned char)p
[i
] == 0; i
--)
4392 return NEIGHBOR_WRAPPED
;
4393 --((unsigned char*)p
)[i
];
4394 l
= rb_enc_precise_mbclen(p
, p
+len
, enc
);
4395 if (MBCLEN_CHARFOUND_P(l
)) {
4396 l
= MBCLEN_CHARFOUND_LEN(l
);
4398 return NEIGHBOR_FOUND
;
4401 memset(p
+l
, 0, len
-l
);
4404 if (MBCLEN_INVALID_P(l
) && i
< len
-1) {
4407 for (len2
= len
-1; 0 < len2
; len2
--) {
4408 l2
= rb_enc_precise_mbclen(p
, p
+len2
, enc
);
4409 if (!MBCLEN_INVALID_P(l2
))
4412 memset(p
+len2
+1, 0, len
-(len2
+1));
4418 Overwrites +p+ with the succeeding letter in +enc+ and returns
4419 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4420 When NEIGHBOR_WRAPPED is returned, the carried-out letter is stored into carry.
4421 Assumes that each range is contiguous and that mbclen
4422 never changes within a range.
4423 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4426 static enum neighbor_char
4427 enc_succ_alnum_char(char *p
, long len
, rb_encoding
*enc
, char *carry
)
4429 enum neighbor_char ret
;
4433 char save
[ONIGENC_CODE_TO_MBC_MAXLEN
];
4435 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4437 const int max_gaps
= 1;
4439 c
= rb_enc_mbc_to_codepoint(p
, p
+len
, enc
);
4440 if (rb_enc_isctype(c
, ONIGENC_CTYPE_DIGIT
, enc
))
4441 ctype
= ONIGENC_CTYPE_DIGIT
;
4442 else if (rb_enc_isctype(c
, ONIGENC_CTYPE_ALPHA
, enc
))
4443 ctype
= ONIGENC_CTYPE_ALPHA
;
4445 return NEIGHBOR_NOT_CHAR
;
4447 MEMCPY(save
, p
, char, len
);
4448 for (try = 0; try <= max_gaps
; ++try) {
4449 ret
= enc_succ_char(p
, len
, enc
);
4450 if (ret
== NEIGHBOR_FOUND
) {
4451 c
= rb_enc_mbc_to_codepoint(p
, p
+len
, enc
);
4452 if (rb_enc_isctype(c
, ctype
, enc
))
4453 return NEIGHBOR_FOUND
;
4456 MEMCPY(p
, save
, char, len
);
4459 MEMCPY(save
, p
, char, len
);
4460 ret
= enc_pred_char(p
, len
, enc
);
4461 if (ret
== NEIGHBOR_FOUND
) {
4462 c
= rb_enc_mbc_to_codepoint(p
, p
+len
, enc
);
4463 if (!rb_enc_isctype(c
, ctype
, enc
)) {
4464 MEMCPY(p
, save
, char, len
);
4469 MEMCPY(p
, save
, char, len
);
4475 return NEIGHBOR_NOT_CHAR
;
4478 if (ctype
!= ONIGENC_CTYPE_DIGIT
) {
4479 MEMCPY(carry
, p
, char, len
);
4480 return NEIGHBOR_WRAPPED
;
4483 MEMCPY(carry
, p
, char, len
);
4484 enc_succ_char(carry
, len
, enc
);
4485 return NEIGHBOR_WRAPPED
;
4489 static VALUE
str_succ(VALUE str
);
4495 * Returns the successor to +self+. The successor is calculated by
4496 * incrementing characters.
4498 * The first character to be incremented is the rightmost alphanumeric,
4499 * or, if no alphanumerics, the rightmost character:
4501 * 'THX1138'.succ # => "THX1139"
4502 * '<<koala>>'.succ # => "<<koalb>>"
4503 * '***'.succ # => "**+"
4505 * The successor to a digit is another digit, "carrying" to the next-left
4506 * character for a "rollover" from 9 to 0, and prepending another digit
4509 * '00'.succ # => "01"
4510 * '09'.succ # => "10"
4511 * '99'.succ # => "100"
4513 * The successor to a letter is another letter of the same case,
4514 * carrying to the next-left character for a rollover,
4515 * and prepending another same-case letter if necessary:
4517 * 'aa'.succ # => "ab"
4518 * 'az'.succ # => "ba"
4519 * 'zz'.succ # => "aaa"
4520 * 'AA'.succ # => "AB"
4521 * 'AZ'.succ # => "BA"
4522 * 'ZZ'.succ # => "AAA"
4524 * The successor to a non-alphanumeric character is the next character
4525 * in the underlying character set's collating sequence,
4526 * carrying to the next-left character for a rollover,
4527 * and prepending another character if necessary:
4530 * s # => "\x00\x00\x00"
4531 * s.succ # => "\x00\x00\x01"
4533 * s # => "\xFF\xFF\xFF"
4534 * s.succ # => "\x01\x00\x00\x00"
4536 * Carrying can occur between and among mixtures of alphanumeric characters:
4539 * s.succ # => "aaa00aa00"
4541 * s.succ # => "100aa00aa"
4543 * The successor to an empty \String is a new empty \String:
4547 * String#next is an alias for String#succ.
/*
 * Non-destructive successor: works on a fresh copy so that `orig` is
 * never handed to str_succ().
 *
 * orig: the source string.
 * Returns the value produced by str_succ() on the copy.
 */
4551 rb_str_succ(VALUE orig
)
/* New string built from orig's byte pointer and byte length. */
4554 str
= rb_str_new(RSTRING_PTR(orig
), RSTRING_LEN(orig
));
/* Carry over orig's metadata to the copy (judging by the helper's name,
 * encoding and code range -- confirm against its definition). */
4555 rb_enc_cr_str_copy_for_substr(str
, orig
);
4556 return str_succ(str
);
4563 char *sbeg
, *s
, *e
, *last_alnum
= 0;
4564 int found_alnum
= 0;
4566 char carry
[ONIGENC_CODE_TO_MBC_MAXLEN
] = "\1";
4567 long carry_pos
= 0, carry_len
= 1;
4568 enum neighbor_char neighbor
= NEIGHBOR_FOUND
;
4570 slen
= RSTRING_LEN(str
);
4571 if (slen
== 0) return str
;
4573 enc
= STR_ENC_GET(str
);
4574 sbeg
= RSTRING_PTR(str
);
4575 s
= e
= sbeg
+ slen
;
4577 while ((s
= rb_enc_prev_char(sbeg
, s
, e
, enc
)) != 0) {
4578 if (neighbor
== NEIGHBOR_NOT_CHAR
&& last_alnum
) {
4579 if (ISALPHA(*last_alnum
) ? ISDIGIT(*s
) :
4580 ISDIGIT(*last_alnum
) ? ISALPHA(*s
) : 0) {
4584 l
= rb_enc_precise_mbclen(s
, e
, enc
);
4585 if (!ONIGENC_MBCLEN_CHARFOUND_P(l
)) continue;
4586 l
= ONIGENC_MBCLEN_CHARFOUND_LEN(l
);
4587 neighbor
= enc_succ_alnum_char(s
, l
, enc
, carry
);
4589 case NEIGHBOR_NOT_CHAR
:
4591 case NEIGHBOR_FOUND
:
4593 case NEIGHBOR_WRAPPED
:
4598 carry_pos
= s
- sbeg
;
4601 if (!found_alnum
) { /* str contains no alnum */
4603 while ((s
= rb_enc_prev_char(sbeg
, s
, e
, enc
)) != 0) {
4604 enum neighbor_char neighbor
;
4605 char tmp
[ONIGENC_CODE_TO_MBC_MAXLEN
];
4606 l
= rb_enc_precise_mbclen(s
, e
, enc
);
4607 if (!ONIGENC_MBCLEN_CHARFOUND_P(l
)) continue;
4608 l
= ONIGENC_MBCLEN_CHARFOUND_LEN(l
);
4609 MEMCPY(tmp
, s
, char, l
);
4610 neighbor
= enc_succ_char(tmp
, l
, enc
);
4612 case NEIGHBOR_FOUND
:
4613 MEMCPY(s
, tmp
, char, l
);
4616 case NEIGHBOR_WRAPPED
:
4617 MEMCPY(s
, tmp
, char, l
);
4619 case NEIGHBOR_NOT_CHAR
:
4622 if (rb_enc_precise_mbclen(s
, s
+l
, enc
) != l
) {
4623 /* wrapped to \0...\0. search next valid char. */
4624 enc_succ_char(s
, l
, enc
);
4626 if (!rb_enc_asciicompat(enc
)) {
4627 MEMCPY(carry
, s
, char, l
);
4630 carry_pos
= s
- sbeg
;
4632 ENC_CODERANGE_SET(str
, ENC_CODERANGE_UNKNOWN
);
4634 RESIZE_CAPA(str
, slen
+ carry_len
);
4635 sbeg
= RSTRING_PTR(str
);
4636 s
= sbeg
+ carry_pos
;
4637 memmove(s
+ carry_len
, s
, slen
- carry_pos
);
4638 memmove(s
, carry
, carry_len
);
4640 STR_SET_LEN(str
, slen
);
4641 TERM_FILL(&sbeg
[slen
], rb_enc_mbminlen(enc
));
4642 rb_enc_str_coderange(str
);
4651 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4653 * String#next! is an alias for String#succ!.
4657 rb_str_succ_bang(VALUE str
)
4665 all_digits_p(const char *s
, long len
)
4668 if (!ISDIGIT(*s
)) return 0;
4675 str_upto_i(VALUE str
, VALUE arg
)
4683 * upto(other_string, exclusive = false) {|string| ... } -> self
4684 * upto(other_string, exclusive = false) -> new_enumerator
4686 * With a block given, calls the block with each \String value
4687 * returned by successive calls to String#succ;
4688 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4689 * the sequence terminates when value +other_string+ is reached;
4692 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4695 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4697 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4699 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4703 * a8 a9 b0 b1 b2 b3 b4 b5
4705 * If +other_string+ would not be reached, does not call the block:
4707 * '25'.upto('5') {|s| fail s }
4708 * 'aa'.upto('a') {|s| fail s }
4710 * With no block given, returns a new \Enumerator:
4712 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
/*
 * Argument-parsing front end for the upto iteration: extracts the end
 * string and the optional exclusive flag, then delegates the actual
 * stepping to rb_str_upto_each() with str_upto_i as the per-element
 * callback.
 *
 * argc/argv: Ruby-level arguments (end string, optional exclusive flag).
 * beg:       the receiver, i.e. the first string of the sequence.
 */
4717 rb_str_upto(int argc
, VALUE
*argv
, VALUE beg
)
4719 VALUE end
, exclusive
;
/* "11" = one mandatory argument (end) and one optional (exclusive). */
4721 rb_scan_args(argc
, argv
, "11", &end
, &exclusive
);
/* Without a block, hand back an enumerator instead of iterating. */
4722 RETURN_ENUMERATOR(beg
, argc
, argv
);
/* The exclusive flag is reduced to a C truth value; Qnil is passed as
 * the callback's extra argument. */
4723 return rb_str_upto_each(beg
, end
, RTEST(exclusive
), str_upto_i
, Qnil
);
4727 rb_str_upto_each(VALUE beg
, VALUE end
, int excl
, int (*each
)(VALUE
, VALUE
), VALUE arg
)
4729 VALUE current
, after_end
;
4734 CONST_ID(succ
, "succ");
4736 enc
= rb_enc_check(beg
, end
);
4737 ascii
= (is_ascii_string(beg
) && is_ascii_string(end
));
4738 /* single character */
4739 if (RSTRING_LEN(beg
) == 1 && RSTRING_LEN(end
) == 1 && ascii
) {
4740 char c
= RSTRING_PTR(beg
)[0];
4741 char e
= RSTRING_PTR(end
)[0];
4743 if (c
> e
|| (excl
&& c
== e
)) return beg
;
4745 if ((*each
)(rb_enc_str_new(&c
, 1, enc
), arg
)) break;
4746 if (!excl
&& c
== e
) break;
4748 if (excl
&& c
== e
) break;
4752 /* both edges are all digits */
4753 if (ascii
&& ISDIGIT(RSTRING_PTR(beg
)[0]) && ISDIGIT(RSTRING_PTR(end
)[0]) &&
4754 all_digits_p(RSTRING_PTR(beg
), RSTRING_LEN(beg
)) &&
4755 all_digits_p(RSTRING_PTR(end
), RSTRING_LEN(end
))) {
4759 width
= RSTRING_LENINT(beg
);
4760 b
= rb_str_to_inum(beg
, 10, FALSE
);
4761 e
= rb_str_to_inum(end
, 10, FALSE
);
4762 if (FIXNUM_P(b
) && FIXNUM_P(e
)) {
4763 long bi
= FIX2LONG(b
);
4764 long ei
= FIX2LONG(e
);
4765 rb_encoding
*usascii
= rb_usascii_encoding();
4768 if (excl
&& bi
== ei
) break;
4769 if ((*each
)(rb_enc_sprintf(usascii
, "%.*ld", width
, bi
), arg
)) break;
4774 ID op
= excl
? '<' : idLE
;
4775 VALUE args
[2], fmt
= rb_fstring_lit("%.*d");
4777 args
[0] = INT2FIX(width
);
4778 while (rb_funcall(b
, op
, 1, e
)) {
4780 if ((*each
)(rb_str_format(numberof(args
), args
, fmt
), arg
)) break;
4781 b
= rb_funcallv(b
, succ
, 0, 0);
4787 n
= rb_str_cmp(beg
, end
);
4788 if (n
> 0 || (excl
&& n
== 0)) return beg
;
4790 after_end
= rb_funcallv(end
, succ
, 0, 0);
4791 current
= str_duplicate(rb_cString
, beg
);
4792 while (!rb_str_equal(current
, after_end
)) {
4794 if (excl
|| !rb_str_equal(current
, end
))
4795 next
= rb_funcallv(current
, succ
, 0, 0);
4796 if ((*each
)(current
, arg
)) break;
4797 if (NIL_P(next
)) break;
4799 StringValue(current
);
4800 if (excl
&& rb_str_equal(current
, end
)) break;
4801 if (RSTRING_LEN(current
) > RSTRING_LEN(end
) || RSTRING_LEN(current
) == 0)
4809 rb_str_upto_endless_each(VALUE beg
, int (*each
)(VALUE
, VALUE
), VALUE arg
)
4814 CONST_ID(succ
, "succ");
4815 /* both edges are all digits */
4816 if (is_ascii_string(beg
) && ISDIGIT(RSTRING_PTR(beg
)[0]) &&
4817 all_digits_p(RSTRING_PTR(beg
), RSTRING_LEN(beg
))) {
4818 VALUE b
, args
[2], fmt
= rb_fstring_lit("%.*d");
4819 int width
= RSTRING_LENINT(beg
);
4820 b
= rb_str_to_inum(beg
, 10, FALSE
);
4822 long bi
= FIX2LONG(b
);
4823 rb_encoding
*usascii
= rb_usascii_encoding();
4825 while (FIXABLE(bi
)) {
4826 if ((*each
)(rb_enc_sprintf(usascii
, "%.*ld", width
, bi
), arg
)) break;
4831 args
[0] = INT2FIX(width
);
4834 if ((*each
)(rb_str_format(numberof(args
), args
, fmt
), arg
)) break;
4835 b
= rb_funcallv(b
, succ
, 0, 0);
4839 current
= str_duplicate(rb_cString
, beg
);
4841 VALUE next
= rb_funcallv(current
, succ
, 0, 0);
4842 if ((*each
)(current
, arg
)) break;
4844 StringValue(current
);
4845 if (RSTRING_LEN(current
) == 0)
/*
 * Callback with the (VALUE, VALUE) shape expected by rb_str_upto_each();
 * rb_str_include_range_p() passes it together with the address of the
 * value being searched for.
 *
 * str: the string produced by the current iteration step.
 * arg: a VALUE* smuggled through a VALUE; it points at the sought value.
 *
 * Returns 0 when `str` is not equal to the sought value.  What happens
 * on equality is not visible in this excerpt.
 */
4853 include_range_i(VALUE str
, VALUE arg
)
4855 VALUE
*argp
= (VALUE
*)arg
;
4856 if (!rb_equal(str
, *argp
)) return 0;
4862 rb_str_include_range_p(VALUE beg
, VALUE end
, VALUE val
, VALUE exclusive
)
4864 beg
= rb_str_new_frozen(beg
);
4866 end
= rb_str_new_frozen(end
);
4867 if (NIL_P(val
)) return Qfalse
;
4868 val
= rb_check_string_type(val
);
4869 if (NIL_P(val
)) return Qfalse
;
4870 if (rb_enc_asciicompat(STR_ENC_GET(beg
)) &&
4871 rb_enc_asciicompat(STR_ENC_GET(end
)) &&
4872 rb_enc_asciicompat(STR_ENC_GET(val
))) {
4873 const char *bp
= RSTRING_PTR(beg
);
4874 const char *ep
= RSTRING_PTR(end
);
4875 const char *vp
= RSTRING_PTR(val
);
4876 if (RSTRING_LEN(beg
) == 1 && RSTRING_LEN(end
) == 1) {
4877 if (RSTRING_LEN(val
) == 0 || RSTRING_LEN(val
) > 1)
4884 if (ISASCII(b
) && ISASCII(e
) && ISASCII(v
)) {
4885 if (b
<= v
&& v
< e
) return Qtrue
;
4886 return RBOOL(!RTEST(exclusive
) && v
== e
);
4891 /* both edges are all digits */
4892 if (ISDIGIT(*bp
) && ISDIGIT(*ep
) &&
4893 all_digits_p(bp
, RSTRING_LEN(beg
)) &&
4894 all_digits_p(ep
, RSTRING_LEN(end
))) {
4899 rb_str_upto_each(beg
, end
, RTEST(exclusive
), include_range_i
, (VALUE
)&val
);
4901 return RBOOL(NIL_P(val
));
/*
 * Returns the part of `str` captured by group `backref` of the first
 * match of `re`.
 *
 * str:     string to search.
 * re:      regexp to search with.
 * backref: capture-group designator, resolved to a group number with
 *          rb_reg_backref_number().
 *
 * NOTE(review): the path taken when rb_reg_search() finds no match is
 * not visible in this excerpt.
 */
4905 rb_str_subpat(VALUE str
, VALUE re
, VALUE backref
)
/* Search from position 0; a non-negative result means a match. */
4907 if (rb_reg_search(re
, str
, 0, 0) >= 0) {
/* The match object is picked up from the backref slot after the search. */
4908 VALUE match
= rb_backref_get();
4909 int nth
= rb_reg_backref_number(match
, backref
);
4910 return rb_reg_nth_match(nth
, match
);
4916 rb_str_aref(VALUE str
, VALUE indx
)
4920 if (FIXNUM_P(indx
)) {
4921 idx
= FIX2LONG(indx
);
4923 else if (RB_TYPE_P(indx
, T_REGEXP
)) {
4924 return rb_str_subpat(str
, indx
, INT2FIX(0));
4926 else if (RB_TYPE_P(indx
, T_STRING
)) {
4927 if (rb_str_index(str
, indx
, 0) != -1)
4928 return str_duplicate(rb_cString
, indx
);
4932 /* check if indx is Range */
4933 long beg
, len
= str_strlen(str
, NULL
);
4934 switch (rb_range_beg_len(indx
, &beg
, &len
, len
, 0)) {
4940 return rb_str_substr(str
, beg
, len
);
4942 idx
= NUM2LONG(indx
);
4945 return str_substr(str
, idx
, 1, FALSE
);
4951 * string[index] -> new_string or nil
4952 * string[start, length] -> new_string or nil
4953 * string[range] -> new_string or nil
4954 * string[regexp, capture = 0] -> new_string or nil
4955 * string[substring] -> new_string or nil
4957 * Returns the substring of +self+ specified by the arguments.
4959 * When the single \Integer argument +index+ is given,
4960 * returns the 1-character substring found in +self+ at offset +index+:
4964 * Counts backward from the end of +self+ if +index+ is negative:
4966 * 'foo'[-3] # => "f"
4968 * Returns +nil+ if +index+ is out of range:
4971 * 'foo'[-4] # => nil
4973 * When the two \Integer arguments +start+ and +length+ are given,
4974 * returns the substring of the given +length+ found in +self+ at offset +start+:
4976 * 'foo'[0, 2] # => "fo"
4977 * 'foo'[0, 0] # => ""
4979 * Counts backward from the end of +self+ if +start+ is negative:
4981 * 'foo'[-2, 2] # => "oo"
4983 * Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4985 * 'foo'[3, 2] # => ""
4987 * Returns +nil+ if +start+ is out of range:
4989 * 'foo'[4, 2] # => nil
4990 * 'foo'[-4, 2] # => nil
4992 * Returns the trailing substring of +self+ if +length+ is large:
4994 * 'foo'[1, 50] # => "oo"
4996 * Returns +nil+ if +length+ is negative:
4998 * 'foo'[0, -1] # => nil
5000 * When the single \Range argument +range+ is given,
5001 * derives +start+ and +length+ values from the given +range+,
5002 * and returns values as above:
5004 * - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
5005 * - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
5007 * When the \Regexp argument +regexp+ is given,
5008 * and the +capture+ argument is <tt>0</tt>,
5009 * returns the first matching substring found in +self+,
5010 * or +nil+ if none found:
5012 * 'foo'[/o/] # => "o"
5013 * 'foo'[/x/] # => nil
5015 * s[/[aeiou](.)\1/] # => "ell"
5016 * s[/[aeiou](.)\1/, 0] # => "ell"
5018 * If argument +capture+ is given and not <tt>0</tt>,
5019 * it should be either an \Integer capture group index or a \String or \Symbol capture group name;
5020 * the method call returns only the specified capture
5021 * (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
5024 * s[/[aeiou](.)\1/, 1] # => "l"
5025 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
5026 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
5028 * If an invalid capture group index is given, +nil+ is returned. If an invalid
5029 * capture group name is given, +IndexError+ is raised.
5031 * When the single \String argument +substring+ is given,
5032 * returns the substring from +self+ if found, otherwise +nil+:
5034 * 'foo'['oo'] # => "oo"
5035 * 'foo'['xx'] # => nil
5037 * String#slice is an alias for String#[].
/*
 * Argument dispatcher for the indexing forms: two-argument calls go to
 * rb_str_subpat() (regexp + capture) or rb_str_substr() (start +
 * length); everything else goes to rb_str_aref().
 *
 * argc/argv: Ruby-level arguments.
 * str:       the receiver string.
 *
 * NOTE(review): the condition that selects the two-argument branch is
 * not visible in this excerpt; the statements below that read argv[1]
 * presumably sit inside it.
 */
5041 rb_str_aref_m(int argc
, VALUE
*argv
, VALUE str
)
/* Regexp first argument: argv[1] designates the capture group. */
5044 if (RB_TYPE_P(argv
[0], T_REGEXP
)) {
5045 return rb_str_subpat(str
, argv
[0], argv
[1]);
/* Otherwise both arguments are converted to C longs: start and length. */
5048 long beg
= NUM2LONG(argv
[0]);
5049 long len
= NUM2LONG(argv
[1]);
5050 return rb_str_substr(str
, beg
, len
);
/* Remaining case: validate the argument count, then index by argv[0]. */
5053 rb_check_arity(argc
, 1, 2);
5054 return rb_str_aref(str
, argv
[0]);
5058 rb_str_drop_bytes(VALUE str
, long len
)
5060 char *ptr
= RSTRING_PTR(str
);
5061 long olen
= RSTRING_LEN(str
), nlen
;
5063 str_modifiable(str
);
5064 if (len
> olen
) len
= olen
;
5066 if (str_embed_capa(str
) >= nlen
+ TERM_LEN(str
)) {
5068 int fl
= (int)(RBASIC(str
)->flags
& (STR_NOEMBED
|STR_SHARED
|STR_NOFREE
));
5070 STR_SET_EMBED_LEN(str
, nlen
);
5071 ptr
= RSTRING(str
)->as
.embed
.ary
;
5072 memmove(ptr
, oldptr
+ len
, nlen
);
5073 if (fl
== STR_NOEMBED
) xfree(oldptr
);
5076 if (!STR_SHARED_P(str
)) {
5077 VALUE shared
= heap_str_make_shared(rb_obj_class(str
), str
);
5078 rb_enc_cr_str_exact_copy(shared
, str
);
5081 ptr
= RSTRING(str
)->as
.heap
.ptr
+= len
;
5082 RSTRING(str
)->as
.heap
.len
= nlen
;
5085 ENC_CODERANGE_CLEAR(str
);
5090 rb_str_splice_0(VALUE str
, long beg
, long len
, VALUE val
)
5093 long slen
, vlen
= RSTRING_LEN(val
);
5096 if (beg
== 0 && vlen
== 0) {
5097 rb_str_drop_bytes(str
, len
);
5101 str_modify_keep_cr(str
);
5102 RSTRING_GETMEM(str
, sptr
, slen
);
5105 RESIZE_CAPA(str
, slen
+ vlen
- len
);
5106 sptr
= RSTRING_PTR(str
);
5109 if (ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
)
5110 cr
= rb_enc_str_coderange(val
);
5112 cr
= ENC_CODERANGE_UNKNOWN
;
5115 memmove(sptr
+ beg
+ vlen
,
5117 slen
- (beg
+ len
));
5119 if (vlen
< beg
&& len
< 0) {
5120 MEMZERO(sptr
+ slen
, char, -len
);
5123 memmove(sptr
+ beg
, RSTRING_PTR(val
), vlen
);
5126 STR_SET_LEN(str
, slen
);
5127 TERM_FILL(&sptr
[slen
], TERM_LEN(str
));
5128 ENC_CODERANGE_SET(str
, cr
);
5132 rb_str_update(VALUE str
, long beg
, long len
, VALUE val
)
5137 int singlebyte
= single_byte_optimizable(str
);
5140 if (len
< 0) rb_raise(rb_eIndexError
, "negative length %ld", len
);
5143 enc
= rb_enc_check(str
, val
);
5144 slen
= str_strlen(str
, enc
); /* rb_enc_check */
5146 if ((slen
< beg
) || ((beg
< 0) && (beg
+ slen
< 0))) {
5147 rb_raise(rb_eIndexError
, "index %ld out of string", beg
);
5153 assert(beg
<= slen
);
5154 if (len
> slen
- beg
) {
5157 str_modify_keep_cr(str
);
5158 p
= str_nth(RSTRING_PTR(str
), RSTRING_END(str
), beg
, enc
, singlebyte
);
5159 if (!p
) p
= RSTRING_END(str
);
5160 e
= str_nth(p
, RSTRING_END(str
), len
, enc
, singlebyte
);
5161 if (!e
) e
= RSTRING_END(str
);
5163 beg
= p
- RSTRING_PTR(str
); /* physical position */
5164 len
= e
- p
; /* physical length */
5165 rb_str_splice_0(str
, beg
, len
, val
);
5166 rb_enc_associate(str
, enc
);
5167 cr
= ENC_CODERANGE_AND(ENC_CODERANGE(str
), ENC_CODERANGE(val
));
5168 if (cr
!= ENC_CODERANGE_BROKEN
)
5169 ENC_CODERANGE_SET(str
, cr
);
5172 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5175 rb_str_subpat_set(VALUE str
, VALUE re
, VALUE backref
, VALUE val
)
5179 long start
, end
, len
;
5181 struct re_registers
*regs
;
5183 if (rb_reg_search(re
, str
, 0, 0) < 0) {
5184 rb_raise(rb_eIndexError
, "regexp not matched");
5186 match
= rb_backref_get();
5187 nth
= rb_reg_backref_number(match
, backref
);
5188 regs
= RMATCH_REGS(match
);
5189 if ((nth
>= regs
->num_regs
) || ((nth
< 0) && (-nth
>= regs
->num_regs
))) {
5190 rb_raise(rb_eIndexError
, "index %d out of regexp", nth
);
5193 nth
+= regs
->num_regs
;
5198 rb_raise(rb_eIndexError
, "regexp group %d not matched", nth
);
5203 enc
= rb_enc_check_str(str
, val
);
5204 rb_str_splice_0(str
, start
, len
, val
);
5205 rb_enc_associate(str
, enc
);
5209 rb_str_aset(VALUE str
, VALUE indx
, VALUE val
)
5213 switch (TYPE(indx
)) {
5215 rb_str_subpat_set(str
, indx
, INT2FIX(0), val
);
5219 beg
= rb_str_index(str
, indx
, 0);
5221 rb_raise(rb_eIndexError
, "string not matched");
5223 beg
= rb_str_sublen(str
, beg
);
5224 rb_str_splice(str
, beg
, str_strlen(indx
, NULL
), val
);
5228 /* check if indx is Range */
5231 if (rb_range_beg_len(indx
, &beg
, &len
, str_strlen(str
, NULL
), 2)) {
5232 rb_str_splice(str
, beg
, len
, val
);
5239 idx
= NUM2LONG(indx
);
5240 rb_str_splice(str
, idx
, 1, val
);
5247 * str[integer] = new_str
5248 * str[integer, integer] = new_str
5249 * str[range] = new_str
5250 * str[regexp] = new_str
5251 * str[regexp, integer] = new_str
5252 * str[regexp, name] = new_str
5253 * str[other_str] = new_str
5255 * Element Assignment---Replaces some or all of the content of
5256 * <i>str</i>. The portion of the string affected is determined using
5257 * the same criteria as String#[]. If the replacement string is not
5258 * the same length as the text it is replacing, the string will be
5259 * adjusted accordingly. If the regular expression or string used
5260 * as the index doesn't match a position in the string, IndexError is
5261 * raised. If the regular expression form is used, the optional
5262 * second Integer allows you to specify which portion of the match to
5263 * replace (effectively using the MatchData indexing rules). The forms
5264 * that take an Integer will raise an IndexError if the value is out
5265 * of range; the Range form will raise a RangeError, and the Regexp
5266 * and String will raise an IndexError on negative match.
5270 rb_str_aset_m(int argc
, VALUE
*argv
, VALUE str
)
5273 if (RB_TYPE_P(argv
[0], T_REGEXP
)) {
5274 rb_str_subpat_set(str
, argv
[0], argv
[1], argv
[2]);
5277 rb_str_splice(str
, NUM2LONG(argv
[0]), NUM2LONG(argv
[1]), argv
[2]);
5281 rb_check_arity(argc
, 2, 3);
5282 return rb_str_aset(str
, argv
[0], argv
[1]);
5287 * insert(index, other_string) -> self
5289 * Inserts the given +other_string+ into +self+; returns +self+.
5291 * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5293 * 'foo'.insert(1, 'bar') # => "fbaroo"
5295 * If the \Integer +index+ is negative, counts backward from the end of +self+
5296 * and inserts +other_string+ at offset <tt>index+1</tt>
5297 * (that is, _after_ <tt>self[index]</tt>):
5299 * 'foo'.insert(-2, 'bar') # => "fobaro"
5304 rb_str_insert(VALUE str
, VALUE idx
, VALUE str2
)
5306 long pos
= NUM2LONG(idx
);
5309 return rb_str_append(str
, str2
);
5314 rb_str_splice(str
, pos
, 0, str2
);
5321 * slice!(index) -> new_string or nil
5322 * slice!(start, length) -> new_string or nil
5323 * slice!(range) -> new_string or nil
5324 * slice!(regexp, capture = 0) -> new_string or nil
5325 * slice!(substring) -> new_string or nil
5327 * Removes the substring of +self+ specified by the arguments;
5328 * returns the removed substring.
5330 * See String#[] for details about the arguments that specify the substring.
5334 * string = "This is a string"
5335 * string.slice!(2) #=> "i"
5336 * string.slice!(3..6) #=> " is "
5337 * string.slice!(/s.*t/) #=> "sa st"
5338 * string.slice!("r") #=> "r"
5339 * string #=> "Thing"
5344 rb_str_slice_bang(int argc
, VALUE
*argv
, VALUE str
)
5346 VALUE result
= Qnil
;
5351 rb_check_arity(argc
, 1, 2);
5352 str_modify_keep_cr(str
);
5354 if (RB_TYPE_P(indx
, T_REGEXP
)) {
5355 if (rb_reg_search(indx
, str
, 0, 0) < 0) return Qnil
;
5356 VALUE match
= rb_backref_get();
5357 struct re_registers
*regs
= RMATCH_REGS(match
);
5359 if (argc
> 1 && (nth
= rb_reg_backref_number(match
, argv
[1])) < 0) {
5360 if ((nth
+= regs
->num_regs
) <= 0) return Qnil
;
5362 else if (nth
>= regs
->num_regs
) return Qnil
;
5364 len
= END(nth
) - beg
;
5367 else if (argc
== 2) {
5368 beg
= NUM2LONG(indx
);
5369 len
= NUM2LONG(argv
[1]);
5372 else if (FIXNUM_P(indx
)) {
5373 beg
= FIX2LONG(indx
);
5374 if (!(p
= rb_str_subpos(str
, beg
, &len
))) return Qnil
;
5375 if (!len
) return Qnil
;
5376 beg
= p
- RSTRING_PTR(str
);
5379 else if (RB_TYPE_P(indx
, T_STRING
)) {
5380 beg
= rb_str_index(str
, indx
, 0);
5381 if (beg
== -1) return Qnil
;
5382 len
= RSTRING_LEN(indx
);
5383 result
= str_duplicate(rb_cString
, indx
);
5387 switch (rb_range_beg_len(indx
, &beg
, &len
, str_strlen(str
, NULL
), 0)) {
5391 beg
= NUM2LONG(indx
);
5392 if (!(p
= rb_str_subpos(str
, beg
, &len
))) return Qnil
;
5393 if (!len
) return Qnil
;
5394 beg
= p
- RSTRING_PTR(str
);
5402 if (!(p
= rb_str_subpos(str
, beg
, &len
))) return Qnil
;
5403 beg
= p
- RSTRING_PTR(str
);
5406 result
= rb_str_new(RSTRING_PTR(str
)+beg
, len
);
5407 rb_enc_cr_str_copy_for_substr(result
, str
);
5412 rb_str_drop_bytes(str
, len
);
5415 char *sptr
= RSTRING_PTR(str
);
5416 long slen
= RSTRING_LEN(str
);
5417 if (beg
+ len
> slen
) /* pathological check */
5421 slen
- (beg
+ len
));
5423 STR_SET_LEN(str
, slen
);
5424 TERM_FILL(&sptr
[slen
], TERM_LEN(str
));
5435 switch (OBJ_BUILTIN_TYPE(pat
)) {
5443 val
= rb_check_string_type(pat
);
5445 Check_Type(pat
, T_REGEXP
);
5450 return rb_reg_regcomp(pat
);
5454 get_pat_quoted(VALUE pat
, int check
)
5458 switch (OBJ_BUILTIN_TYPE(pat
)) {
5466 val
= rb_check_string_type(pat
);
5468 Check_Type(pat
, T_REGEXP
);
5472 if (check
&& is_broken_string(pat
)) {
5473 rb_exc_raise(rb_reg_check_preprocess(pat
));
5479 rb_pat_search(VALUE pat
, VALUE str
, long pos
, int set_backref_str
)
5481 if (BUILTIN_TYPE(pat
) == T_STRING
) {
5482 pos
= rb_strseq_index(str
, pat
, pos
, 1);
5483 if (set_backref_str
) {
5485 str
= rb_str_new_frozen_String(str
);
5486 rb_backref_set_string(str
, pos
, RSTRING_LEN(pat
));
5489 rb_backref_set(Qnil
);
5495 return rb_reg_search0(pat
, str
, pos
, 0, set_backref_str
);
5502 * sub!(pattern, replacement) -> self or nil
5503 * sub!(pattern) {|match| ... } -> self or nil
5505 * Returns +self+ with only the first occurrence
5506 * (not all occurrences) of the given +pattern+ replaced.
5508 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5510 * Related: String#sub, String#gsub, String#gsub!.
5515 rb_str_sub_bang(int argc
, VALUE
*argv
, VALUE str
)
5517 VALUE pat
, repl
, hash
= Qnil
;
5520 int min_arity
= rb_block_given_p() ? 1 : 2;
5523 rb_check_arity(argc
, min_arity
, 2);
5529 hash
= rb_check_hash_type(argv
[1]);
5535 pat
= get_pat_quoted(argv
[0], 1);
5537 str_modifiable(str
);
5538 beg
= rb_pat_search(pat
, str
, 0, 1);
5541 int cr
= ENC_CODERANGE(str
);
5543 VALUE match
, match0
= Qnil
;
5544 struct re_registers
*regs
;
5548 match
= rb_backref_get();
5549 regs
= RMATCH_REGS(match
);
5550 if (RB_TYPE_P(pat
, T_STRING
)) {
5552 end0
= beg0
+ RSTRING_LEN(pat
);
5558 if (iter
) match0
= rb_reg_nth_match(0, match
);
5561 if (iter
|| !NIL_P(hash
)) {
5562 p
= RSTRING_PTR(str
); len
= RSTRING_LEN(str
);
5565 repl
= rb_obj_as_string(rb_yield(match0
));
5568 repl
= rb_hash_aref(hash
, rb_str_subseq(str
, beg0
, end0
- beg0
));
5569 repl
= rb_obj_as_string(repl
);
5571 str_mod_check(str
, p
, len
);
5572 rb_check_frozen(str
);
5575 repl
= rb_reg_regsub(repl
, str
, regs
, RB_TYPE_P(pat
, T_STRING
) ? Qnil
: pat
);
5578 enc
= rb_enc_compatible(str
, repl
);
5580 rb_encoding
*str_enc
= STR_ENC_GET(str
);
5581 p
= RSTRING_PTR(str
); len
= RSTRING_LEN(str
);
5582 if (coderange_scan(p
, beg0
, str_enc
) != ENC_CODERANGE_7BIT
||
5583 coderange_scan(p
+end0
, len
-end0
, str_enc
) != ENC_CODERANGE_7BIT
) {
5584 rb_raise(rb_eEncCompatError
, "incompatible character encodings: %s and %s",
5585 rb_enc_name(str_enc
),
5586 rb_enc_name(STR_ENC_GET(repl
)));
5588 enc
= STR_ENC_GET(repl
);
5591 rb_enc_associate(str
, enc
);
5592 if (ENC_CODERANGE_UNKNOWN
< cr
&& cr
< ENC_CODERANGE_BROKEN
) {
5593 int cr2
= ENC_CODERANGE(repl
);
5594 if (cr2
== ENC_CODERANGE_BROKEN
||
5595 (cr
== ENC_CODERANGE_VALID
&& cr2
== ENC_CODERANGE_7BIT
))
5596 cr
= ENC_CODERANGE_UNKNOWN
;
5601 rlen
= RSTRING_LEN(repl
);
5602 len
= RSTRING_LEN(str
);
5604 RESIZE_CAPA(str
, len
+ rlen
- plen
);
5606 p
= RSTRING_PTR(str
);
5608 memmove(p
+ beg0
+ rlen
, p
+ beg0
+ plen
, len
- beg0
- plen
);
5610 rp
= RSTRING_PTR(repl
);
5611 memmove(p
+ beg0
, rp
, rlen
);
5613 STR_SET_LEN(str
, len
);
5614 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
5615 ENC_CODERANGE_SET(str
, cr
);
5625 * sub(pattern, replacement) -> new_string
5626 * sub(pattern) {|match| ... } -> new_string
5628 * Returns a copy of +self+ with only the first occurrence
5629 * (not all occurrences) of the given +pattern+ replaced.
5631 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5633 * Related: String#sub!, String#gsub, String#gsub!.
/*
 * Non-destructive substitution: duplicates the receiver and runs the
 * in-place rb_str_sub_bang() on the duplicate, so the original string
 * object is not the one being modified.
 *
 * argc/argv: forwarded unchanged to rb_str_sub_bang().
 * str:       the receiver; rebound to the duplicate below.
 *
 * NOTE(review): the return statement is not visible in this excerpt.
 */
5638 rb_str_sub(int argc
, VALUE
*argv
, VALUE str
)
/* From here on `str` names the copy, not the caller's object. */
5640 str
= str_duplicate(rb_cString
, str
);
/* The result of rb_str_sub_bang() is not used here. */
5641 rb_str_sub_bang(argc
, argv
, str
);
5646 str_gsub(int argc
, VALUE
*argv
, VALUE str
, int bang
)
5648 VALUE pat
, val
= Qnil
, repl
, match
, match0
= Qnil
, dest
, hash
= Qnil
;
5649 struct re_registers
*regs
;
5650 long beg
, beg0
, end0
;
5651 long offset
, blen
, slen
, len
, last
;
5652 enum {STR
, ITER
, MAP
} mode
= STR
;
5654 int need_backref
= -1;
5655 rb_encoding
*str_enc
;
5659 RETURN_ENUMERATOR(str
, argc
, argv
);
5664 hash
= rb_check_hash_type(argv
[1]);
5673 rb_error_arity(argc
, 1, 2);
5676 pat
= get_pat_quoted(argv
[0], 1);
5677 beg
= rb_pat_search(pat
, str
, 0, need_backref
);
5679 if (bang
) return Qnil
; /* no match, no substitution */
5680 return str_duplicate(rb_cString
, str
);
5684 blen
= RSTRING_LEN(str
) + 30; /* len + margin */
5685 dest
= rb_str_buf_new(blen
);
5686 sp
= RSTRING_PTR(str
);
5687 slen
= RSTRING_LEN(str
);
5689 str_enc
= STR_ENC_GET(str
);
5690 rb_enc_associate(dest
, str_enc
);
5691 ENC_CODERANGE_SET(dest
, rb_enc_asciicompat(str_enc
) ? ENC_CODERANGE_7BIT
: ENC_CODERANGE_VALID
);
5694 match
= rb_backref_get();
5695 regs
= RMATCH_REGS(match
);
5696 if (RB_TYPE_P(pat
, T_STRING
)) {
5698 end0
= beg0
+ RSTRING_LEN(pat
);
5704 if (mode
== ITER
) match0
= rb_reg_nth_match(0, match
);
5709 val
= rb_obj_as_string(rb_yield(match0
));
5712 val
= rb_hash_aref(hash
, rb_str_subseq(str
, beg0
, end0
- beg0
));
5713 val
= rb_obj_as_string(val
);
5715 str_mod_check(str
, sp
, slen
);
5716 if (val
== dest
) { /* paranoid check [ruby-dev:24827] */
5717 rb_raise(rb_eRuntimeError
, "block should not cheat");
5720 else if (need_backref
) {
5721 val
= rb_reg_regsub(repl
, str
, regs
, RB_TYPE_P(pat
, T_STRING
) ? Qnil
: pat
);
5722 if (need_backref
< 0) {
5723 need_backref
= val
!= repl
;
5730 len
= beg0
- offset
; /* copy pre-match substr */
5732 rb_enc_str_buf_cat(dest
, cp
, len
, str_enc
);
5735 rb_str_buf_append(dest
, val
);
5741 * Always consume at least one character of the input string
5742 * in order to prevent infinite loops.
5744 if (RSTRING_LEN(str
) <= end0
) break;
5745 len
= rb_enc_fast_mbclen(RSTRING_PTR(str
)+end0
, RSTRING_END(str
), str_enc
);
5746 rb_enc_str_buf_cat(dest
, RSTRING_PTR(str
)+end0
, len
, str_enc
);
5747 offset
= end0
+ len
;
5749 cp
= RSTRING_PTR(str
) + offset
;
5750 if (offset
> RSTRING_LEN(str
)) break;
5751 beg
= rb_pat_search(pat
, str
, offset
, need_backref
);
5753 if (RSTRING_LEN(str
) > offset
) {
5754 rb_enc_str_buf_cat(dest
, cp
, RSTRING_LEN(str
) - offset
, str_enc
);
5756 rb_pat_search(pat
, str
, last
, 1);
5758 str_shared_replace(str
, dest
);
5770 * gsub!(pattern, replacement) -> self or nil
5771 * gsub!(pattern) {|match| ... } -> self or nil
5772 * gsub!(pattern) -> an_enumerator
5774 * Performs the specified substring replacement(s) on +self+;
5775 * returns +self+ if any replacement occurred, +nil+ otherwise.
5777 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5779 * Returns an Enumerator if no +replacement+ and no block given.
5781 * Related: String#sub, String#gsub, String#sub!.
5786 rb_str_gsub_bang(int argc
, VALUE
*argv
, VALUE str
)
5788 str_modify_keep_cr(str
);
5789 return str_gsub(argc
, argv
, str
, 1);
5795 * gsub(pattern, replacement) -> new_string
5796 * gsub(pattern) {|match| ... } -> new_string
5797 * gsub(pattern) -> enumerator
5799 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5801 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5803 * Returns an Enumerator if no +replacement+ and no block given.
5805 * Related: String#sub, String#sub!, String#gsub!.
5810 rb_str_gsub(int argc
, VALUE
*argv
, VALUE str
)
5812 return str_gsub(argc
, argv
, str
, 0);
5818 * replace(other_string) -> self
5820 * Replaces the contents of +self+ with the contents of +other_string+:
5822 * s = 'foo' # => "foo"
5823 * s.replace('bar') # => "bar"
/*
 * Backs String#replace.  Calls str_modifiable() on the receiver, returns
 * the receiver immediately when both arguments are the same object, and
 * otherwise returns the result of str_replace(str, str2).
 * NOTE(review): presumably str_modifiable() raises for unmodifiable
 * strings -- confirm against its definition.
 */
5828 rb_str_replace(VALUE str
, VALUE str2
)
5830 str_modifiable(str
);
5831 if (str
== str2
) return str
;
5835 return str_replace(str
, str2
);
5842 * Removes the contents of +self+:
5844 * s = 'foo' # => "foo"
/*
 * Backs String#clear.  Sets the embedded length to 0, stores a 0 byte at
 * the start of the buffer, and resets the code range: 7BIT when the
 * string's encoding is ASCII-compatible.  The VALID assignment is
 * presumably the alternative branch -- TODO confirm.
 */
5850 rb_str_clear(VALUE str
)
5854 STR_SET_EMBED_LEN(str
, 0);
5855 RSTRING_PTR(str
)[0] = 0;
5856 if (rb_enc_asciicompat(STR_ENC_GET(str
)))
5857 ENC_CODERANGE_SET(str
, ENC_CODERANGE_7BIT
);
5859 ENC_CODERANGE_SET(str
, ENC_CODERANGE_VALID
);
5867 * Returns a string containing the first character of +self+:
5869 * s = 'foo' # => "foo"
5875 rb_str_chr(VALUE str
)
5877 return rb_str_substr(str
, 0, 1);
5882 * getbyte(index) -> integer
5884 * Returns the byte at zero-based +index+ as an integer:
5886 * s = 'abcde' # => "abcde"
5887 * s.getbyte(0) # => 97
5888 * s.getbyte(1) # => 98
5890 * Related: String#setbyte.
/*
 * Backs String#getbyte.  Converts +index+ with NUM2LONG, may add the
 * string length to it (presumably for negative indexes -- TODO confirm),
 * rejects positions outside [0, length), and otherwise returns the
 * addressed byte, read as unsigned char, as a Fixnum.
 */
5893 rb_str_getbyte(VALUE str
, VALUE index
)
5895 long pos
= NUM2LONG(index
);
5898 pos
+= RSTRING_LEN(str
);
5899 if (pos
< 0 || RSTRING_LEN(str
) <= pos
)
5902 return INT2FIX((unsigned char)RSTRING_PTR(str
)[pos
]);
5907 * setbyte(index, integer) -> integer
5909 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
5911 * s = 'abcde' # => "abcde"
5912 * s.setbyte(0, 98) # => 98
5915 * Related: String#getbyte.
/*
 * Backs String#setbyte.
 *
 * Raises IndexError ("index %ld out of string") when +index+ is outside
 * [-len, len).  +value+ is converted with rb_to_int() and masked to its low
 * eight bits.  The string is made independent (str_make_independent) when
 * str_independent() reports it is not.
 *
 * The rest of the function re-evaluates the cached code range, with cases
 * for ENC_CODERANGE_7BIT and ENC_CODERANGE_VALID: it uses
 * rb_enc_precise_mbclen() on the character around the written byte and
 * marks the string BROKEN or VALID, or clears the cached range.
 * NOTE(review): the statement that stores the byte and the return are not
 * among the lines shown here.
 */
5918 rb_str_setbyte(VALUE str
, VALUE index
, VALUE value
)
5920 long pos
= NUM2LONG(index
);
5921 long len
= RSTRING_LEN(str
);
5922 char *ptr
, *head
, *left
= 0;
5924 int cr
= ENC_CODERANGE_UNKNOWN
, width
, nlen
;
5926 if (pos
< -len
|| len
<= pos
)
5927 rb_raise(rb_eIndexError
, "index %ld out of string", pos
);
5931 VALUE v
= rb_to_int(value
);
5932 VALUE w
= rb_int_and(v
, INT2FIX(0xff));
5933 char byte
= (char)(NUM2INT(w
) & 0xFF);
5935 if (!str_independent(str
))
5936 str_make_independent(str
);
5937 enc
= STR_ENC_GET(str
);
5938 head
= RSTRING_PTR(str
);
5940 if (!STR_EMBED_P(str
)) {
5941 cr
= ENC_CODERANGE(str
);
5943 case ENC_CODERANGE_7BIT
:
5946 if (ISASCII(byte
)) goto end
;
5947 nlen
= rb_enc_precise_mbclen(left
, head
+len
, enc
);
5948 if (!MBCLEN_CHARFOUND_P(nlen
))
5949 ENC_CODERANGE_SET(str
, ENC_CODERANGE_BROKEN
);
5951 ENC_CODERANGE_SET(str
, ENC_CODERANGE_VALID
);
5953 case ENC_CODERANGE_VALID
:
5954 left
= rb_enc_left_char_head(head
, ptr
, head
+len
, enc
);
5955 width
= rb_enc_precise_mbclen(left
, head
+len
, enc
);
5957 nlen
= rb_enc_precise_mbclen(left
, head
+len
, enc
);
5958 if (!MBCLEN_CHARFOUND_P(nlen
))
5959 ENC_CODERANGE_SET(str
, ENC_CODERANGE_BROKEN
);
5960 else if (MBCLEN_CHARFOUND_LEN(nlen
) != width
|| ISASCII(byte
))
5961 ENC_CODERANGE_CLEAR(str
);
5965 ENC_CODERANGE_CLEAR(str
);
/*
 * Builds the byte-wise substring of +str+ starting at byte offset +beg+
 * with byte length +len+.
 *
 * Returns Qnil when +beg+ is past the end, +len+ is negative, or +beg+ is
 * still negative at the second check.  Qnil is also returned on a path
 * guarded by !empty (presumably the zero-length case -- TODO confirm).
 *
 * When the result is not embeddable and SHARABLE_SUBSTRING_P() holds, the
 * result shares the buffer of a frozen copy of +str+, with its heap
 * pointer advanced by +beg+ and its heap length set to +len+.  Another
 * path copies the bytes with rb_str_new().  The encoding is copied from
 * +str+, and the code range is set from the result length and the source
 * code range (7BIT, VALID or UNKNOWN).
 */
5973 str_byte_substr(VALUE str
, long beg
, long len
, int empty
)
5975 char *p
, *s
= RSTRING_PTR(str
);
5976 long n
= RSTRING_LEN(str
);
5979 if (beg
> n
|| len
< 0) return Qnil
;
5982 if (beg
< 0) return Qnil
;
5987 if (!empty
) return Qnil
;
5994 if (!STR_EMBEDDABLE_P(len
, TERM_LEN(str
)) && SHARABLE_SUBSTRING_P(beg
, len
, n
)) {
5995 str2
= rb_str_new_frozen(str
);
5996 str2
= str_new_shared(rb_cString
, str2
);
5997 RSTRING(str2
)->as
.heap
.ptr
+= beg
;
5998 RSTRING(str2
)->as
.heap
.len
= len
;
6001 str2
= rb_str_new(p
, len
);
6004 str_enc_copy(str2
, str
);
6006 if (RSTRING_LEN(str2
) == 0) {
6007 if (!rb_enc_asciicompat(STR_ENC_GET(str
)))
6008 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_VALID
);
6010 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_7BIT
);
6013 switch (ENC_CODERANGE(str
)) {
6014 case ENC_CODERANGE_7BIT
:
6015 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_7BIT
);
6018 ENC_CODERANGE_SET(str2
, ENC_CODERANGE_UNKNOWN
);
/*
 * Single-argument form of byteslice.  A Fixnum +indx+ is unpacked with
 * FIX2LONG.  Otherwise +indx+ is tried as a Range via rb_range_beg_len(),
 * and a range match returns str_byte_substr(str, beg, len, TRUE).  Any
 * other value is converted with NUM2LONG.  The integer paths return the
 * one-byte slice str_byte_substr(str, idx, 1, FALSE).
 */
6027 str_byte_aref(VALUE str
, VALUE indx
)
6030 if (FIXNUM_P(indx
)) {
6031 idx
= FIX2LONG(indx
);
6034 /* check if indx is Range */
6035 long beg
, len
= RSTRING_LEN(str
);
6037 switch (rb_range_beg_len(indx
, &beg
, &len
, len
, 0)) {
6043 return str_byte_substr(str
, beg
, len
, TRUE
);
6046 idx
= NUM2LONG(indx
);
6048 return str_byte_substr(str
, idx
, 1, FALSE
);
6053 * byteslice(index, length = 1) -> string or nil
6054 * byteslice(range) -> string or nil
6056 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6058 * With integer arguments +index+ and +length+ given,
6059 * returns the substring beginning at the given +index+
6060 * of the given +length+ (if possible),
6061 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6063 * s = '0123456789' # => "0123456789"
6064 * s.byteslice(2) # => "2"
6065 * s.byteslice(200) # => nil
6066 * s.byteslice(4, 3) # => "456"
6067 * s.byteslice(4, 30) # => "456789"
6068 * s.byteslice(4, -1) # => nil
6069 * s.byteslice(40, 2) # => nil
6071 * In either case above, counts backwards from the end of +self+
6072 * if +index+ is negative:
6074 * s = '0123456789' # => "0123456789"
6075 * s.byteslice(-4) # => "6"
6076 * s.byteslice(-4, 3) # => "678"
6078 * With Range argument +range+ given, returns
6079 * <tt>byteslice(range.begin, range.size)</tt>:
6081 * s = '0123456789' # => "0123456789"
6082 * s.byteslice(4..6) # => "456"
6083 * s.byteslice(-6..-4) # => "456"
6084 * s.byteslice(5..2) # => "" # range.size is zero.
6085 * s.byteslice(40..42) # => nil
6087 * In all cases, a returned string has the same encoding as +self+:
6089 * s.encoding # => #<Encoding:UTF-8>
6090 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
/*
 * Backs String#byteslice.  One path converts argv[0] and argv[1] with
 * NUM2LONG and returns str_byte_substr(str, beg, end, TRUE); note that the
 * local named +end+ is passed as the length argument.  Otherwise the arity
 * is checked to be 1..2 and the call is forwarded to
 * str_byte_aref(str, argv[0]).
 * NOTE(review): the condition selecting the first path is not among the
 * lines shown here (presumably argc == 2 -- confirm).
 */
6095 rb_str_byteslice(int argc
, VALUE
*argv
, VALUE str
)
6098 long beg
= NUM2LONG(argv
[0]);
6099 long end
= NUM2LONG(argv
[1]);
6100 return str_byte_substr(str
, beg
, end
, TRUE
);
6102 rb_check_arity(argc
, 1, 2);
6103 return str_byte_aref(str
, argv
[0]);
6110 * Returns a new string with the characters from +self+ in reverse order.
6112 * 'stressed'.reverse # => "desserts"
/*
 * Backs String#reverse.  Strings of length <= 1 are returned as a
 * duplicate made by str_duplicate(rb_cString, str).  Otherwise a new string
 * of the same byte length is allocated and written to starting from its
 * end pointer.  There are three visible strategies: single-byte
 * optimizable strings, strings whose code range is VALID (character
 * lengths from rb_enc_fast_mbclen), and a general path using
 * rb_enc_mbclen().  The general path starts from 7BIT (or VALID for
 * non-ASCII-compatible encodings) and downgrades the code range to UNKNOWN
 * on seeing a multibyte character or a byte with the high bit set.  The
 * result gets the source's encoding and the computed code range.
 */
6117 rb_str_reverse(VALUE str
)
6124 if (RSTRING_LEN(str
) <= 1) return str_duplicate(rb_cString
, str
);
6125 enc
= STR_ENC_GET(str
);
6126 rev
= rb_str_new(0, RSTRING_LEN(str
));
6127 s
= RSTRING_PTR(str
); e
= RSTRING_END(str
);
6128 p
= RSTRING_END(rev
);
6129 cr
= ENC_CODERANGE(str
);
6131 if (RSTRING_LEN(str
) > 1) {
6132 if (single_byte_optimizable(str
)) {
6137 else if (cr
== ENC_CODERANGE_VALID
) {
6139 int clen
= rb_enc_fast_mbclen(s
, e
, enc
);
6147 cr
= rb_enc_asciicompat(enc
) ?
6148 ENC_CODERANGE_7BIT
: ENC_CODERANGE_VALID
;
6150 int clen
= rb_enc_mbclen(s
, e
, enc
);
6152 if (clen
> 1 || (*s
& 0x80)) cr
= ENC_CODERANGE_UNKNOWN
;
6159 STR_SET_LEN(rev
, RSTRING_LEN(str
));
6160 str_enc_copy(rev
, str
);
6161 ENC_CODERANGE_SET(rev
, cr
);
6171 * Returns +self+ with its characters reversed:
6174 * s.reverse! # => "desserts"
/*
 * Backs String#reverse!.  For strings longer than one byte that are
 * single-byte optimizable, the buffer is prepared with
 * str_modify_keep_cr() and walked between its first and last byte.
 * Another path replaces the receiver's contents with rb_str_reverse(str)
 * through str_shared_replace().  str_modify_keep_cr() is also called on a
 * further path (presumably the short-string case -- TODO confirm).
 */
6180 rb_str_reverse_bang(VALUE str
)
6182 if (RSTRING_LEN(str
) > 1) {
6183 if (single_byte_optimizable(str
)) {
6186 str_modify_keep_cr(str
);
6187 s
= RSTRING_PTR(str
);
6188 e
= RSTRING_END(str
) - 1;
6196 str_shared_replace(str
, rb_str_reverse(str
));
6200 str_modify_keep_cr(str
);
6208 * include? other_string -> true or false
6210 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6213 * s.include?('f') # => true
6214 * s.include?('fo') # => true
6215 * s.include?('food') # => false
/*
 * Backs String#include?.  Searches for +arg+ from offset 0 with
 * rb_str_index() and returns true unless the search result is -1.
 */
6220 rb_str_include(VALUE str
, VALUE arg
)
6225 i
= rb_str_index(str
, arg
, 0);
6227 return RBOOL(i
!= -1);
6233 * to_i(base = 10) -> integer
6235 * Returns the result of interpreting leading characters in +self+
6236 * as an integer in the given +base+ (which must be in (2..36)):
6238 * '123456'.to_i # => 123456
6239 * '123def'.to_i(16) # => 1195503
6241 * Characters past a leading valid number (in the given +base+) are ignored:
6243 * '12.345'.to_i # => 12
6244 * '12345'.to_i(2) # => 1
6246 * Returns zero if there is no leading valid number:
6248 * 'abcdef'.to_i # => 0
6249 * '2'.to_i(2) # => 0
/*
 * Backs String#to_i.  Accepts zero or one argument.  When a base is given
 * it is converted with NUM2INT, and a negative value raises ArgumentError
 * ("invalid radix %d").  The conversion itself is rb_str_to_inum() with
 * its third argument FALSE.
 * NOTE(review): the declaration and default of +base+ are not among the
 * lines shown here.
 */
6254 rb_str_to_i(int argc
, VALUE
*argv
, VALUE str
)
6258 if (rb_check_arity(argc
, 0, 1) && (base
= NUM2INT(argv
[0])) < 0) {
6259 rb_raise(rb_eArgError
, "invalid radix %d", base
);
6261 return rb_str_to_inum(str
, base
, FALSE
);
6269 * Returns the result of interpreting leading characters in +self+ as a Float:
6271 * '3.14159'.to_f # => 3.14159
6272 * '1.234e-2'.to_f # => 0.01234
6274 * Characters past a leading valid number are ignored:
6276 * '3.14 (pi to two places)'.to_f # => 3.14
6278 * Returns zero if there is no leading valid number:
6280 * 'abcdef'.to_f # => 0.0
6285 rb_str_to_f(VALUE str
)
6287 return DBL2NUM(rb_str_to_dbl(str
, FALSE
));
6293 * to_s -> self or string
6295 * Returns +self+ if +self+ is a \String,
6296 * or +self+ converted to a \String if +self+ is a subclass of \String.
6298 * String#to_str is an alias for String#to_s.
/*
 * Backs String#to_s.  When the receiver's class is not exactly rb_cString,
 * returns a String copy made by str_duplicate(rb_cString, str).
 * NOTE(review): the return for the plain-String case is not among the
 * lines shown here; the method documentation says it returns +self+.
 */
6303 rb_str_to_s(VALUE str
)
6305 if (rb_obj_class(str
) != rb_cString
) {
6306 return str_duplicate(rb_cString
, str
);
6313 str_cat_char(VALUE str
, unsigned int c
, rb_encoding
*enc
)
6315 char s
[RUBY_MAX_CHAR_LEN
];
6316 int n
= rb_enc_codelen(c
, enc
);
6318 rb_enc_mbcput(c
, s
, enc
);
6319 rb_enc_str_buf_cat(str
, s
, n
, enc
);
6323 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
/*
 * Formats code +c+ into a small stack buffer and appends it to +result+
 * with rb_str_buf_cat().  The visible formats are: the character itself
 * when c < 0x7F and printable, "\uXXXX" when c < 0x10000, "\u{X...}",
 * "\xXX" and "\x{X...}".
 * NOTE(review): the conditions choosing between the \u and \x forms are
 * not all among the lines shown here; presumably +unicode_p+ selects the
 * \u forms -- confirm.
 */
6326 rb_str_buf_cat_escaped_char(VALUE result
, unsigned int c
, int unicode_p
)
6328 char buf
[CHAR_ESC_LEN
+ 1];
6335 if (c
< 0x7F && ISPRINT(c
)) {
6336 snprintf(buf
, CHAR_ESC_LEN
, "%c", c
);
6338 else if (c
< 0x10000) {
6339 snprintf(buf
, CHAR_ESC_LEN
, "\\u%04X", c
);
6342 snprintf(buf
, CHAR_ESC_LEN
, "\\u{%X}", c
);
6347 snprintf(buf
, CHAR_ESC_LEN
, "\\x%02X", c
);
6350 snprintf(buf
, CHAR_ESC_LEN
, "\\x{%X}", c
);
6353 l
= (int)strlen(buf
); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6354 rb_str_buf_cat(result
, buf
, l
);
/*
 * Maps a control character to its backslash escape as a C string literal:
 * NUL, newline, carriage return, tab, form feed, vertical tab, backspace,
 * bell, escape (033) and DEL ("\c?").
 * NOTE(review): the result for any other +c+ is not among the lines shown
 * here.
 */
6359 ruby_escaped_char(int c
)
6362 case '\0': return "\\0";
6363 case '\n': return "\\n";
6364 case '\r': return "\\r";
6365 case '\t': return "\\t";
6366 case '\f': return "\\f";
6367 case '\013': return "\\v";
6368 case '\010': return "\\b";
6369 case '\007': return "\\a";
6370 case '\033': return "\\e";
6371 case '\x7f': return "\\c?";
/*
 * Returns a new US-ASCII, 7BIT string holding an escaped rendering of
 * +str+.
 *
 * The input is scanned character by character with
 * rb_enc_precise_mbclen().  Pending unescaped bytes (from +prev+ up to the
 * current position) are flushed with str_buf_cat() before each escape is
 * written.
 *  - Bytes that do not form a valid character are written as "\xXX".
 *  - Characters known to ruby_escaped_char() use that escape.
 *  - Printable ASCII in an ASCII-compatible encoding is handled by its own
 *    branch, whose body is not among the lines shown here.
 *  - Anything else goes through rb_str_buf_cat_escaped_char().
 */
6377 rb_str_escape(VALUE str
)
6379 int encidx
= ENCODING_GET(str
);
6380 rb_encoding
*enc
= rb_enc_from_index(encidx
);
6381 const char *p
= RSTRING_PTR(str
);
6382 const char *pend
= RSTRING_END(str
);
6383 const char *prev
= p
;
6384 char buf
[CHAR_ESC_LEN
+ 1];
6385 VALUE result
= rb_str_buf_new(0);
6386 int unicode_p
= rb_enc_unicode_p(enc
);
6387 int asciicompat
= rb_enc_asciicompat(enc
);
6392 int n
= rb_enc_precise_mbclen(p
, pend
, enc
);
6393 if (!MBCLEN_CHARFOUND_P(n
)) {
6394 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6395 n
= rb_enc_mbminlen(enc
);
6397 n
= (int)(pend
- p
);
6399 snprintf(buf
, CHAR_ESC_LEN
, "\\x%02X", *p
& 0377);
6400 str_buf_cat(result
, buf
, strlen(buf
));
6405 n
= MBCLEN_CHARFOUND_LEN(n
);
6406 c
= rb_enc_mbc_to_codepoint(p
, pend
, enc
);
6408 cc
= ruby_escaped_char(c
);
6410 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6411 str_buf_cat(result
, cc
, strlen(cc
));
6414 else if (asciicompat
&& rb_enc_isascii(c
, enc
) && ISPRINT(c
)) {
6417 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6418 rb_str_buf_cat_escaped_char(result
, c
, unicode_p
);
6422 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6423 ENCODING_CODERANGE_SET(result
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
6432 * Returns a printable version of +self+, enclosed in double-quotes,
6433 * and with special characters escaped:
6435 * s = "foo\tbar\tbaz\n"
6436 * # => "foo\tbar\tbaz\n"
6438 * # => "\"foo\\tbar\\tbaz\\n\""
/*
 * Backs String#inspect.
 *
 * The result encoding is the default internal encoding, falling back to
 * the default external encoding, and to US-ASCII when that is not
 * ASCII-compatible.  The result is opened and closed with a double quote.
 *
 * The source is scanned with rb_enc_precise_mbclen() using the encoding
 * from get_actual_encoding() when it differs from the declared one.
 *  - Invalid bytes become "\xXX".
 *  - '"' and '\\' get a leading backslash, as does a further condition
 *    whose first line is not shown here (it checks the following
 *    character for '$', '@' or '{').
 *  - \n \r \t \f \v \b \a and ESC are written as two-character escapes.
 *  - Characters printable in the result encoding, or printable ASCII in an
 *    ASCII-compatible encoding, are handled by their own branch, whose
 *    body is not among the lines shown here.
 *  - Everything else goes through rb_str_buf_cat_escaped_char().
 */
6443 rb_str_inspect(VALUE str
)
6445 int encidx
= ENCODING_GET(str
);
6446 rb_encoding
*enc
= rb_enc_from_index(encidx
), *actenc
;
6447 const char *p
, *pend
, *prev
;
6448 char buf
[CHAR_ESC_LEN
+ 1];
6449 VALUE result
= rb_str_buf_new(0);
6450 rb_encoding
*resenc
= rb_default_internal_encoding();
6451 int unicode_p
= rb_enc_unicode_p(enc
);
6452 int asciicompat
= rb_enc_asciicompat(enc
);
6454 if (resenc
== NULL
) resenc
= rb_default_external_encoding();
6455 if (!rb_enc_asciicompat(resenc
)) resenc
= rb_usascii_encoding();
6456 rb_enc_associate(result
, resenc
);
6457 str_buf_cat2(result
, "\"");
6459 p
= RSTRING_PTR(str
); pend
= RSTRING_END(str
);
6461 actenc
= get_actual_encoding(encidx
, str
);
6462 if (actenc
!= enc
) {
6464 if (unicode_p
) unicode_p
= rb_enc_unicode_p(enc
);
6470 n
= rb_enc_precise_mbclen(p
, pend
, enc
);
6471 if (!MBCLEN_CHARFOUND_P(n
)) {
6472 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6473 n
= rb_enc_mbminlen(enc
);
6475 n
= (int)(pend
- p
);
6477 snprintf(buf
, CHAR_ESC_LEN
, "\\x%02X", *p
& 0377);
6478 str_buf_cat(result
, buf
, strlen(buf
));
6483 n
= MBCLEN_CHARFOUND_LEN(n
);
6484 c
= rb_enc_mbc_to_codepoint(p
, pend
, enc
);
6486 if ((asciicompat
|| unicode_p
) &&
6487 (c
== '"'|| c
== '\\' ||
6490 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p
,pend
,enc
)) &&
6491 (cc
= rb_enc_codepoint(p
,pend
,enc
),
6492 (cc
== '$' || cc
== '@' || cc
== '{'))))) {
6493 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6494 str_buf_cat2(result
, "\\");
6495 if (asciicompat
|| enc
== resenc
) {
6501 case '\n': cc
= 'n'; break;
6502 case '\r': cc
= 'r'; break;
6503 case '\t': cc
= 't'; break;
6504 case '\f': cc
= 'f'; break;
6505 case '\013': cc
= 'v'; break;
6506 case '\010': cc
= 'b'; break;
6507 case '\007': cc
= 'a'; break;
6508 case 033: cc
= 'e'; break;
6509 default: cc
= 0; break;
6512 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6515 str_buf_cat(result
, buf
, 2);
6519 if ((enc
== resenc
&& rb_enc_isprint(c
, enc
)) ||
6520 (asciicompat
&& rb_enc_isascii(c
, enc
) && ISPRINT(c
))) {
6524 if (p
- n
> prev
) str_buf_cat(result
, prev
, p
- n
- prev
);
6525 rb_str_buf_cat_escaped_char(result
, c
, unicode_p
);
6530 if (p
> prev
) str_buf_cat(result
, prev
, p
- prev
);
6531 str_buf_cat2(result
, "\"");
6536 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6542 * Returns a printable version of +self+, enclosed in double-quotes,
6543 * with special characters escaped, and with non-printing characters
6544 * replaced by hexadecimal notation:
6546 * "hello \n ''".dump # => "\"hello \\n ''\""
6547 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6549 * Related: String#undump (inverse of String#dump).
/*
 * Backs String#dump.  Works in two passes over the bytes of +str+.
 *
 * Pass 1 computes the output length.  Each byte contributes a per-byte
 * +clen+, and the sum is checked against LONG_MAX, raising RuntimeError
 * ("string size too big").  The escapes '"', '\\', \n \r \t \f \v \b \a
 * and ESC share one case; the branch using IS_EVSTR() counts 2 or 1; for
 * UTF-8 input a valid non-ASCII character counts 6, 9 or 10 (\uXXXX,
 * \u{XXXXX}, \u{XXXXXX}); other bytes count 4 (\xNN).  For encodings that
 * are not ASCII-compatible, room is added for the
 * ".dup.force_encoding(\"%s\")" suffix and the encoding name.
 *
 * Pass 2 allocates the result with rb_str_new(0, len) and writes the
 * escapes, using snprintf() bounded by +qend+ for the \u, \u{} and \x
 * forms and for the suffix.
 *
 * The result is tagged 7BIT and associated with the source encoding index,
 * or with ASCII-8BIT when the suffix was written.
 */
6554 rb_str_dump(VALUE str
)
6556 int encidx
= rb_enc_get_index(str
);
6557 rb_encoding
*enc
= rb_enc_from_index(encidx
);
6559 const char *p
, *pend
;
6562 int u8
= (encidx
== rb_utf8_encindex());
6563 static const char nonascii_suffix
[] = ".dup.force_encoding(\"%s\")";
6566 if (!rb_enc_asciicompat(enc
)) {
6567 len
+= strlen(nonascii_suffix
) - rb_strlen_lit("%s");
6568 len
+= strlen(enc
->name
);
6571 p
= RSTRING_PTR(str
); pend
= p
+ RSTRING_LEN(str
);
6574 unsigned char c
= *p
++;
6577 case '"': case '\\':
6578 case '\n': case '\r':
6579 case '\t': case '\f':
6580 case '\013': case '\010': case '\007': case '\033':
6585 clen
= IS_EVSTR(p
, pend
) ? 2 : 1;
6593 if (u8
&& c
> 0x7F) { /* \u notation */
6594 int n
= rb_enc_precise_mbclen(p
-1, pend
, enc
);
6595 if (MBCLEN_CHARFOUND_P(n
)) {
6596 unsigned int cc
= rb_enc_mbc_to_codepoint(p
-1, pend
, enc
);
6598 clen
= 6; /* \uXXXX */
6599 else if (cc
<= 0xFFFFF)
6600 clen
= 9; /* \u{XXXXX} */
6602 clen
= 10; /* \u{XXXXXX} */
6603 p
+= MBCLEN_CHARFOUND_LEN(n
)-1;
6607 clen
= 4; /* \xNN */
6612 if (clen
> LONG_MAX
- len
) {
6613 rb_raise(rb_eRuntimeError
, "string size too big");
6618 result
= rb_str_new(0, len
);
6619 p
= RSTRING_PTR(str
); pend
= p
+ RSTRING_LEN(str
);
6620 q
= RSTRING_PTR(result
); qend
= q
+ len
+ 1;
6624 unsigned char c
= *p
++;
6626 if (c
== '"' || c
== '\\') {
6630 else if (c
== '#') {
6631 if (IS_EVSTR(p
, pend
)) *q
++ = '\\';
6634 else if (c
== '\n') {
6638 else if (c
== '\r') {
6642 else if (c
== '\t') {
6646 else if (c
== '\f') {
6650 else if (c
== '\013') {
6654 else if (c
== '\010') {
6658 else if (c
== '\007') {
6662 else if (c
== '\033') {
6666 else if (ISPRINT(c
)) {
6672 int n
= rb_enc_precise_mbclen(p
-1, pend
, enc
) - 1;
6673 if (MBCLEN_CHARFOUND_P(n
)) {
6674 int cc
= rb_enc_mbc_to_codepoint(p
-1, pend
, enc
);
6677 snprintf(q
, qend
-q
, "u%04X", cc
); /* \uXXXX */
6679 snprintf(q
, qend
-q
, "u{%X}", cc
); /* \u{XXXXX} or \u{XXXXXX} */
6684 snprintf(q
, qend
-q
, "x%02X", c
);
6690 if (!rb_enc_asciicompat(enc
)) {
6691 snprintf(q
, qend
-q
, nonascii_suffix
, enc
->name
);
6692 encidx
= rb_ascii8bit_encindex();
6694 /* result from dump is ASCII */
6695 rb_enc_associate_index(result
, encidx
);
6696 ENC_CODERANGE_SET(result
, ENC_CODERANGE_7BIT
);
/*
 * Helper used by undump_after_backslash(), which stores its result as the
 * single byte to append for an escape character.
 * NOTE(review): the mapping itself is not among the lines shown here; the
 * trailing UNREACHABLE_RETURN(-1) suggests every handled case returns
 * earlier -- confirm.
 */
6701 unescape_ascii(unsigned int c
)
6721 UNREACHABLE_RETURN(-1);
/*
 * Decodes one escape sequence for String#undump and appends the decoded
 * bytes to +undumped+.  +ss+ points at the caller's read cursor and
 * +s_end+ at the end of the input.
 *
 * Visible handling:
 *  - some characters are appended as themselves ("cat itself");
 *  - single-character escapes are translated through unescape_ascii();
 *  - Unicode escapes switch the result to UTF-8 (via +penc+ and
 *    rb_enc_associate) and accept both "\u{...}" (1 to 6 hex digits per
 *    code point) and "\uXXXX"; surrogates 0xD800..0xDFFF and too-large
 *    values raise RuntimeError;
 *  - hex escapes read two hex digits with scan_hex();
 *  - one path appends two bytes starting at s-1, i.e. the byte before the
 *    cursor and the current one.
 *
 * RuntimeError is also raised for a mix of hex and Unicode escapes and for
 * malformed or unterminated escapes (see the message strings below).
 * NOTE(review): presumably +utf8+ and +binary+ record which escape family
 * has been seen -- confirm; the lines that set them are not shown here.
 */
6725 undump_after_backslash(VALUE undumped
, const char **ss
, const char *s_end
, rb_encoding
**penc
, bool *utf8
, bool *binary
)
6727 const char *s
= *ss
;
6731 unsigned char buf
[6];
6732 static rb_encoding
*enc_utf8
= NULL
;
6738 rb_str_cat(undumped
, s
, 1); /* cat itself */
6749 *buf
= unescape_ascii(*s
);
6750 rb_str_cat(undumped
, (char *)buf
, 1);
6755 rb_raise(rb_eRuntimeError
, "hex escape and Unicode escape are mixed");
6759 rb_raise(rb_eRuntimeError
, "invalid Unicode escape");
6761 if (enc_utf8
== NULL
) enc_utf8
= rb_utf8_encoding();
6762 if (*penc
!= enc_utf8
) {
6764 rb_enc_associate(undumped
, enc_utf8
);
6766 if (*s
== '{') { /* handle \u{...} form */
6770 rb_raise(rb_eRuntimeError
, "unterminated Unicode escape");
6780 c
= scan_hex(s
, s_end
-s
, &hexlen
);
6781 if (hexlen
== 0 || hexlen
> 6) {
6782 rb_raise(rb_eRuntimeError
, "invalid Unicode escape");
6785 rb_raise(rb_eRuntimeError
, "invalid Unicode codepoint (too large)");
6787 if (0xd800 <= c
&& c
<= 0xdfff) {
6788 rb_raise(rb_eRuntimeError
, "invalid Unicode codepoint");
6790 codelen
= rb_enc_mbcput(c
, (char *)buf
, *penc
);
6791 rb_str_cat(undumped
, (char *)buf
, codelen
);
6795 else { /* handle \uXXXX form */
6796 c
= scan_hex(s
, 4, &hexlen
);
6798 rb_raise(rb_eRuntimeError
, "invalid Unicode escape");
6800 if (0xd800 <= c
&& c
<= 0xdfff) {
6801 rb_raise(rb_eRuntimeError
, "invalid Unicode codepoint");
6803 codelen
= rb_enc_mbcput(c
, (char *)buf
, *penc
);
6804 rb_str_cat(undumped
, (char *)buf
, codelen
);
6810 rb_raise(rb_eRuntimeError
, "hex escape and Unicode escape are mixed");
6814 rb_raise(rb_eRuntimeError
, "invalid hex escape");
6816 *buf
= scan_hex(s
, 2, &hexlen
);
6818 rb_raise(rb_eRuntimeError
, "invalid hex escape");
6820 rb_str_cat(undumped
, (char *)buf
, 1);
6824 rb_str_cat(undumped
, s
-1, 2);
6831 static VALUE
rb_str_is_ascii_only_p(VALUE str
);
6837 * Returns an unescaped version of +self+:
6839 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
6840 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6841 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
6842 * s_undumped == s_orig # => true
6844 * Related: String#dump (inverse of String#undump).
/*
 * Backs String#undump.
 *
 * Preconditions, each raising on failure: the encoding must be
 * ASCII-compatible (rb_must_asciicompat), the content must be ASCII only
 * ("non-ASCII character detected"), and must contain no NUL byte ("string
 * contains null byte").  The input must be at least two bytes long and
 * start with '"'.
 *
 * The body between the quotes is copied into +undumped+, with each
 * backslash sequence delegated to undump_after_backslash().
 *
 * After the closing quote an optional ".dup" and then a
 * .force_encoding("NAME") suffix are accepted.  NAME is resolved with
 * rb_enc_find_index2() and associated with the result.  The suffix is
 * rejected when a Unicode escape was seen or when the encoding name is
 * unknown.
 *
 * Any other shape raises RuntimeError via the invalid_format label.
 */
6849 str_undump(VALUE str
)
6851 const char *s
= RSTRING_PTR(str
);
6852 const char *s_end
= RSTRING_END(str
);
6853 rb_encoding
*enc
= rb_enc_get(str
);
6854 VALUE undumped
= rb_enc_str_new(s
, 0L, enc
);
6856 bool binary
= false;
6859 rb_must_asciicompat(str
);
6860 if (rb_str_is_ascii_only_p(str
) == Qfalse
) {
6861 rb_raise(rb_eRuntimeError
, "non-ASCII character detected");
6863 if (!str_null_check(str
, &w
)) {
6864 rb_raise(rb_eRuntimeError
, "string contains null byte");
6866 if (RSTRING_LEN(str
) < 2) goto invalid_format
;
6867 if (*s
!= '"') goto invalid_format
;
6869 /* strip '"' at the start */
6874 rb_raise(rb_eRuntimeError
, "unterminated dumped string");
6881 /* ascii compatible dumped string */
6885 static const char force_encoding_suffix
[] = ".force_encoding(\""; /* "\")" */
6886 static const char dup_suffix
[] = ".dup";
6887 const char *encname
;
6891 /* check separately for strings dumped by older versions */
6892 size
= sizeof(dup_suffix
) - 1;
6893 if (s_end
- s
> size
&& memcmp(s
, dup_suffix
, size
) == 0) s
+= size
;
6895 size
= sizeof(force_encoding_suffix
) - 1;
6896 if (s_end
- s
<= size
) goto invalid_format
;
6897 if (memcmp(s
, force_encoding_suffix
, size
) != 0) goto invalid_format
;
6901 rb_raise(rb_eRuntimeError
, "dumped string contained Unicode escape but used force_encoding");
6905 s
= memchr(s
, '"', s_end
-s
);
6907 if (!s
) goto invalid_format
;
6908 if (s_end
- s
!= 2) goto invalid_format
;
6909 if (s
[0] != '"' || s
[1] != ')') goto invalid_format
;
6911 encidx
= rb_enc_find_index2(encname
, (long)size
);
6913 rb_raise(rb_eRuntimeError
, "dumped string has unknown encoding name");
6915 rb_enc_associate_index(undumped
, encidx
);
6923 rb_raise(rb_eRuntimeError
, "invalid escape");
6925 undump_after_backslash(undumped
, &s
, s_end
, &enc
, &utf8
, &binary
);
6928 rb_str_cat(undumped
, s
++, 1);
6934 rb_raise(rb_eRuntimeError
, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
/*
 * Raises Encoding::CompatibilityError ("incompatible encoding with this
 * operation: %s") when +enc+ is a dummy encoding according to
 * rb_enc_dummy_p().  Does nothing otherwise.
 */
6938 rb_str_check_dummy_enc(rb_encoding
*enc
)
6940 if (rb_enc_dummy_p(enc
)) {
6941 rb_raise(rb_eEncCompatError
, "incompatible encoding with this operation: %s",
6946 static rb_encoding
*
6947 str_true_enc(VALUE str
)
6949 rb_encoding
*enc
= STR_ENC_GET(str
);
6950 rb_str_check_dummy_enc(enc
);
/*
 * Folds the option symbols given to the case-mapping methods into +flags+
 * and returns the updated flags.
 *
 * Recognized options:
 *  - :turkic and :lithuanian set ONIGENC_CASE_FOLD_TURKISH_AZERI and
 *    ONIGENC_CASE_FOLD_LITHUANIAN; each may be followed by the other, and
 *    any different second option raises "invalid second option".
 *  - :ascii sets ONIGENC_CASE_ASCII_ONLY.
 *  - :fold is accepted only when the incoming flags request downcasing.
 *    It then toggles ONIGENC_CASE_FOLD and ONIGENC_CASE_DOWNCASE;
 *    otherwise ArgumentError "option :fold only allowed for downcasing"
 *    is raised.
 *
 * Too many options, or any other first option, raises ArgumentError.
 */
6954 static OnigCaseFoldType
6955 check_case_options(int argc
, VALUE
*argv
, OnigCaseFoldType flags
)
6960 rb_raise(rb_eArgError
, "too many options");
6961 if (argv
[0]==sym_turkic
) {
6962 flags
|= ONIGENC_CASE_FOLD_TURKISH_AZERI
;
6964 if (argv
[1]==sym_lithuanian
)
6965 flags
|= ONIGENC_CASE_FOLD_LITHUANIAN
;
6967 rb_raise(rb_eArgError
, "invalid second option");
6970 else if (argv
[0]==sym_lithuanian
) {
6971 flags
|= ONIGENC_CASE_FOLD_LITHUANIAN
;
6973 if (argv
[1]==sym_turkic
)
6974 flags
|= ONIGENC_CASE_FOLD_TURKISH_AZERI
;
6976 rb_raise(rb_eArgError
, "invalid second option");
6980 rb_raise(rb_eArgError
, "too many options");
6981 else if (argv
[0]==sym_ascii
)
6982 flags
|= ONIGENC_CASE_ASCII_ONLY
;
6983 else if (argv
[0]==sym_fold
) {
6984 if ((flags
& (ONIGENC_CASE_UPCASE
|ONIGENC_CASE_DOWNCASE
)) == ONIGENC_CASE_DOWNCASE
)
6985 flags
^= ONIGENC_CASE_FOLD
|ONIGENC_CASE_DOWNCASE
;
6987 rb_raise(rb_eArgError
, "option :fold only allowed for downcasing");
6990 rb_raise(rb_eArgError
, "invalid option");
/*
 * Decides whether the byte-wise helpers (upcase_single, downcase_single)
 * may be used in place of full case mapping.  Two conditions are visible:
 *  - ASCII-only mapping was requested and the encoding is UTF-8 or has a
 *    maximum character length of 1 (the result of this branch is not
 *    among the lines shown here -- presumably true; confirm);
 *  - otherwise, the Turkic option is not set and the string's code range
 *    is 7BIT.
 */
6995 case_option_single_p(OnigCaseFoldType flags
, rb_encoding
*enc
, VALUE str
)
6997 if ((flags
& ONIGENC_CASE_ASCII_ONLY
) && (enc
==rb_utf8_encoding() || rb_enc_mbmaxlen(enc
) == 1))
6999 return !(flags
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) && ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
;
7002 /* 16 should be long enough to absorb any kind of single character length increase; the value below (20) leaves some slack */
7003 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
7004 #ifndef CASEMAP_DEBUG
7005 # define CASEMAP_DEBUG 0
/*
 * One chunk of case-mapping output.  Chunks form a singly linked list
 * through +next+ and carry their payload in the trailing flexible array
 * +space+.
 * NOTE(review): rb_str_casemap() also reads and writes ->capa and ->used;
 * their declarations are not among the lines shown here.
 */
7008 struct mapping_buffer
;
7009 typedef struct mapping_buffer
{
7012 struct mapping_buffer
*next
;
7013 OnigUChar space
[FLEX_ARY_LEN
];
7017 mapping_buffer_free(void *p
)
7019 mapping_buffer
*previous_buffer
;
7020 mapping_buffer
*current_buffer
= p
;
7021 while (current_buffer
) {
7022 previous_buffer
= current_buffer
;
7023 current_buffer
= current_buffer
->next
;
7024 ruby_sized_xfree(previous_buffer
, previous_buffer
->capa
);
/*
 * Typed-data descriptor used by rb_str_casemap() to wrap its buffer chain.
 * mapping_buffer_free is installed in the second function slot
 * (presumably the free callback -- TODO confirm against rb_data_type_t).
 */
7028 static const rb_data_type_t mapping_buffer_type
= {
7030 {0, mapping_buffer_free
,}
7034 rb_str_casemap(VALUE source
, OnigCaseFoldType
*flags
, rb_encoding
*enc
)
7038 const OnigUChar
*source_current
, *source_end
;
7039 int target_length
= 0;
7040 VALUE buffer_anchor
;
7041 mapping_buffer
*current_buffer
= 0;
7042 mapping_buffer
**pre_buffer
;
7043 size_t buffer_count
= 0;
7044 int buffer_length_or_invalid
;
7046 if (RSTRING_LEN(source
) == 0) return str_duplicate(rb_cString
, source
);
7048 source_current
= (OnigUChar
*)RSTRING_PTR(source
);
7049 source_end
= (OnigUChar
*)RSTRING_END(source
);
7051 buffer_anchor
= TypedData_Wrap_Struct(0, &mapping_buffer_type
, 0);
7052 pre_buffer
= (mapping_buffer
**)&DATA_PTR(buffer_anchor
);
7053 while (source_current
< source_end
) {
7054 /* increase multiplier using buffer count to converge quickly */
7055 size_t capa
= (size_t)(source_end
-source_current
)*++buffer_count
+ CASE_MAPPING_ADDITIONAL_LENGTH
;
7056 if (CASEMAP_DEBUG
) {
7057 fprintf(stderr
, "Buffer allocation, capa is %"PRIuSIZE
"\n", capa
); /* for tuning */
7059 current_buffer
= xmalloc(offsetof(mapping_buffer
, space
) + capa
);
7060 *pre_buffer
= current_buffer
;
7061 pre_buffer
= ¤t_buffer
->next
;
7062 current_buffer
->next
= NULL
;
7063 current_buffer
->capa
= capa
;
7064 buffer_length_or_invalid
= enc
->case_map(flags
,
7065 &source_current
, source_end
,
7066 current_buffer
->space
,
7067 current_buffer
->space
+current_buffer
->capa
,
7069 if (buffer_length_or_invalid
< 0) {
7070 current_buffer
= DATA_PTR(buffer_anchor
);
7071 DATA_PTR(buffer_anchor
) = 0;
7072 mapping_buffer_free(current_buffer
);
7073 rb_raise(rb_eArgError
, "input string invalid");
7075 target_length
+= current_buffer
->used
= buffer_length_or_invalid
;
7077 if (CASEMAP_DEBUG
) {
7078 fprintf(stderr
, "Buffer count is %"PRIuSIZE
"\n", buffer_count
); /* for tuning */
7081 if (buffer_count
==1) {
7082 target
= rb_str_new((const char*)current_buffer
->space
, target_length
);
7085 char *target_current
;
7087 target
= rb_str_new(0, target_length
);
7088 target_current
= RSTRING_PTR(target
);
7089 current_buffer
= DATA_PTR(buffer_anchor
);
7090 while (current_buffer
) {
7091 memcpy(target_current
, current_buffer
->space
, current_buffer
->used
);
7092 target_current
+= current_buffer
->used
;
7093 current_buffer
= current_buffer
->next
;
7096 current_buffer
= DATA_PTR(buffer_anchor
);
7097 DATA_PTR(buffer_anchor
) = 0;
7098 mapping_buffer_free(current_buffer
);
7100 /* TODO: check about string terminator character */
7101 str_enc_copy(target
, source
);
7102 /*ENC_CODERANGE_SET(mapped, cr);*/
/*
 * ASCII-only case mapping from +source+ into +target+ using
 * onigenc_ascii_only_case_map().  +target+ may be the same object as
 * +source+, in which case the mapping is done in place.
 *
 * Returns Qnil immediately for an empty +source+.  Raises ArgumentError
 * ("input string invalid") when the mapper reports a negative length.
 * With CASEMAP_DEBUG enabled, a length change is reported on stderr and
 * raised as an internal problem.  Finally the encoding of +source+ is
 * copied to +target+.
 * NOTE(review): the return value on the normal path is not among the
 * lines shown here.
 */
7108 rb_str_ascii_casemap(VALUE source
, VALUE target
, OnigCaseFoldType
*flags
, rb_encoding
*enc
)
7110 const OnigUChar
*source_current
, *source_end
;
7111 OnigUChar
*target_current
, *target_end
;
7112 long old_length
= RSTRING_LEN(source
);
7113 int length_or_invalid
;
7115 if (old_length
== 0) return Qnil
;
7117 source_current
= (OnigUChar
*)RSTRING_PTR(source
);
7118 source_end
= (OnigUChar
*)RSTRING_END(source
);
7119 if (source
== target
) {
7120 target_current
= (OnigUChar
*)source_current
;
7121 target_end
= (OnigUChar
*)source_end
;
7124 target_current
= (OnigUChar
*)RSTRING_PTR(target
);
7125 target_end
= (OnigUChar
*)RSTRING_END(target
);
7128 length_or_invalid
= onigenc_ascii_only_case_map(flags
,
7129 &source_current
, source_end
,
7130 target_current
, target_end
, enc
);
7131 if (length_or_invalid
< 0)
7132 rb_raise(rb_eArgError
, "input string invalid");
7133 if (CASEMAP_DEBUG
&& length_or_invalid
!= old_length
) {
7134 fprintf(stderr
, "problem with rb_str_ascii_casemap"
7135 "; old_length=%ld, new_length=%d\n", old_length
, length_or_invalid
);
7136 rb_raise(rb_eArgError
, "internal problem with rb_str_ascii_casemap"
7137 "; old_length=%ld, new_length=%d\n", old_length
, length_or_invalid
);
7140 str_enc_copy(target
, source
);
/*
 * Byte-wise ASCII upcasing of +str+ in place: a byte in 'a'..'z' is
 * replaced by the matching byte in 'A'..'Z'.
 * NOTE(review): the loop and the return are not among the lines shown
 * here; callers treat a true result as "something was modified".
 */
7146 upcase_single(VALUE str
)
7148 char *s
= RSTRING_PTR(str
), *send
= RSTRING_END(str
);
7149 bool modified
= false;
7152 unsigned int c
= *(unsigned char*)s
;
7154 if ('a' <= c
&& c
<= 'z') {
7155 *s
= 'A' + (c
- 'a');
7165 * upcase!(*options) -> self or nil
7167 * Upcases the characters in +self+;
7168 * returns +self+ if any changes were made, +nil+ otherwise:
7170 * s = 'Hello World!' # => "Hello World!"
7171 * s.upcase! # => "HELLO WORLD!"
7172 * s # => "HELLO WORLD!"
7173 * s.upcase! # => nil
7175 * The casing may be affected by the given +options+;
7176 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7178 * Related: String#upcase, String#downcase, String#downcase!.
/*
 * Backs String#upcase!.  Parses the options into flags starting from
 * ONIGENC_CASE_UPCASE, prepares the receiver with str_modify_keep_cr(),
 * and then picks one of three strategies:
 *  - upcase_single() when case_option_single_p() allows it, setting
 *    ONIGENC_CASE_MODIFIED if it reports a change;
 *  - rb_str_ascii_casemap() in place when ASCII-only mapping was
 *    requested;
 *  - otherwise rb_str_casemap(), whose result replaces the receiver's
 *    contents.
 * Returns the receiver when ONIGENC_CASE_MODIFIED is set in the flags.
 * The other return is not among the lines shown here (nil, per the method
 * documentation).
 */
7183 rb_str_upcase_bang(int argc
, VALUE
*argv
, VALUE str
)
7186 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
;
7188 flags
= check_case_options(argc
, argv
, flags
);
7189 str_modify_keep_cr(str
);
7190 enc
= str_true_enc(str
);
7191 if (case_option_single_p(flags
, enc
, str
)) {
7192 if (upcase_single(str
))
7193 flags
|= ONIGENC_CASE_MODIFIED
;
7195 else if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7196 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
7198 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7200 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7207 * upcase(*options) -> string
7209 * Returns a string containing the upcased characters in +self+:
7211 * s = 'Hello World!' # => "Hello World!"
7212 * s.upcase # => "HELLO WORLD!"
7214 * The casing may be affected by the given +options+;
7215 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7217 * Related: String#upcase!, String#downcase, String#downcase!.
/*
 * Backs String#upcase.  Builds a new string rather than touching the
 * receiver:
 *  - when case_option_single_p() allows the byte-wise path, a copy of the
 *    receiver's bytes and encoding is made (the call that upcases the
 *    copy is not among the lines shown here);
 *  - for ASCII-only mapping, a fresh string of the same length is filled
 *    by rb_str_ascii_casemap();
 *  - otherwise the result comes from rb_str_casemap().
 */
7222 rb_str_upcase(int argc
, VALUE
*argv
, VALUE str
)
7225 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
;
7228 flags
= check_case_options(argc
, argv
, flags
);
7229 enc
= str_true_enc(str
);
7230 if (case_option_single_p(flags
, enc
, str
)) {
7231 ret
= rb_str_new(RSTRING_PTR(str
), RSTRING_LEN(str
));
7232 str_enc_copy(ret
, str
);
7235 else if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7236 ret
= rb_str_new(0, RSTRING_LEN(str
));
7237 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
7240 ret
= rb_str_casemap(str
, &flags
, enc
);
/*
 * Byte-wise ASCII downcasing of +str+ in place: a byte in 'A'..'Z' is
 * replaced by the matching byte in 'a'..'z'.
 * NOTE(review): the loop and the return are not among the lines shown
 * here; callers treat a true result as "something was modified".
 */
7247 downcase_single(VALUE str
)
7249 char *s
= RSTRING_PTR(str
), *send
= RSTRING_END(str
);
7250 bool modified
= false;
7253 unsigned int c
= *(unsigned char*)s
;
7255 if ('A' <= c
&& c
<= 'Z') {
7256 *s
= 'a' + (c
- 'A');
7267 * downcase!(*options) -> self or nil
7269 * Downcases the characters in +self+;
7270 * returns +self+ if any changes were made, +nil+ otherwise:
7272 * s = 'Hello World!' # => "Hello World!"
7273 * s.downcase! # => "hello world!"
7274 * s # => "hello world!"
7275 * s.downcase! # => nil
7277 * The casing may be affected by the given +options+;
7278 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7280 * Related: String#downcase, String#upcase, String#upcase!.
/*
 * Backs String#downcase!.  Parses the options into flags starting from
 * ONIGENC_CASE_DOWNCASE, prepares the receiver with str_modify_keep_cr(),
 * and then picks one of three strategies:
 *  - downcase_single() when case_option_single_p() allows it, setting
 *    ONIGENC_CASE_MODIFIED if it reports a change;
 *  - rb_str_ascii_casemap() in place when ASCII-only mapping was
 *    requested;
 *  - otherwise rb_str_casemap(), whose result replaces the receiver's
 *    contents.
 * Returns the receiver when ONIGENC_CASE_MODIFIED is set in the flags.
 * The other return is not among the lines shown here (nil, per the method
 * documentation).
 */
7285 rb_str_downcase_bang(int argc
, VALUE
*argv
, VALUE str
)
7288 OnigCaseFoldType flags
= ONIGENC_CASE_DOWNCASE
;
7290 flags
= check_case_options(argc
, argv
, flags
);
7291 str_modify_keep_cr(str
);
7292 enc
= str_true_enc(str
);
7293 if (case_option_single_p(flags
, enc
, str
)) {
7294 if (downcase_single(str
))
7295 flags
|= ONIGENC_CASE_MODIFIED
;
7297 else if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7298 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
7300 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7302 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7309 * downcase(*options) -> string
7311 * Returns a string containing the downcased characters in +self+:
7313 * s = 'Hello World!' # => "Hello World!"
7314 * s.downcase # => "hello world!"
7316 * The casing may be affected by the given +options+;
7317 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7319 * Related: String#downcase!, String#upcase, String#upcase!.
/*
 * Backs String#downcase.  Builds a new string rather than touching the
 * receiver:
 *  - when case_option_single_p() allows the byte-wise path, a copy of the
 *    receiver's bytes and encoding is made and downcased with
 *    downcase_single();
 *  - for ASCII-only mapping, a fresh string of the same length is filled
 *    by rb_str_ascii_casemap();
 *  - otherwise the result comes from rb_str_casemap().
 */
7324 rb_str_downcase(int argc
, VALUE
*argv
, VALUE str
)
7327 OnigCaseFoldType flags
= ONIGENC_CASE_DOWNCASE
;
7330 flags
= check_case_options(argc
, argv
, flags
);
7331 enc
= str_true_enc(str
);
7332 if (case_option_single_p(flags
, enc
, str
)) {
7333 ret
= rb_str_new(RSTRING_PTR(str
), RSTRING_LEN(str
));
7334 str_enc_copy(ret
, str
);
7335 downcase_single(ret
);
7337 else if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7338 ret
= rb_str_new(0, RSTRING_LEN(str
));
7339 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
7342 ret
= rb_str_casemap(str
, &flags
, enc
);
7351 * capitalize!(*options) -> self or nil
7353 * Upcases the first character in +self+;
7354 * downcases the remaining characters;
7355 * returns +self+ if any changes were made, +nil+ otherwise:
7357 * s = 'hello World!' # => "hello World!"
7358 * s.capitalize! # => "Hello world!"
7359 * s # => "Hello world!"
7360 * s.capitalize! # => nil
7362 * The casing may be affected by the given +options+;
7363 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7365 * Related: String#capitalize.
7370 rb_str_capitalize_bang(int argc
, VALUE
*argv
, VALUE str
)
7373 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_TITLECASE
;
7375 flags
= check_case_options(argc
, argv
, flags
);
7376 str_modify_keep_cr(str
);
7377 enc
= str_true_enc(str
);
7378 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return Qnil
;
7379 if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7380 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
7382 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7384 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7391 * capitalize(*options) -> string
7393 * Returns a string containing the characters in +self+;
7394 * the first character is upcased;
7395 * the remaining characters are downcased:
7397 * s = 'hello World!' # => "hello World!"
7398 * s.capitalize # => "Hello world!"
7400 * The casing may be affected by the given +options+;
7401 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7403 * Related: String#capitalize!.
7408 rb_str_capitalize(int argc
, VALUE
*argv
, VALUE str
)
7411 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_TITLECASE
;
7414 flags
= check_case_options(argc
, argv
, flags
);
7415 enc
= str_true_enc(str
);
7416 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return str
;
7417 if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7418 ret
= rb_str_new(0, RSTRING_LEN(str
));
7419 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
7422 ret
= rb_str_casemap(str
, &flags
, enc
);
7430 * swapcase!(*options) -> self or nil
7432 * Upcases each lowercase character in +self+;
7433 * downcases each uppercase character;
7434 * returns +self+ if any changes were made, +nil+ otherwise:
7436 * s = 'Hello World!' # => "Hello World!"
7437 * s.swapcase! # => "hELLO wORLD!"
7438 * s # => "hELLO wORLD!"
7439 * ''.swapcase! # => nil
7441 * The casing may be affected by the given +options+;
7442 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7444 * Related: String#swapcase.
7449 rb_str_swapcase_bang(int argc
, VALUE
*argv
, VALUE str
)
7452 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_DOWNCASE
;
7454 flags
= check_case_options(argc
, argv
, flags
);
7455 str_modify_keep_cr(str
);
7456 enc
= str_true_enc(str
);
7457 if (flags
&ONIGENC_CASE_ASCII_ONLY
)
7458 rb_str_ascii_casemap(str
, str
, &flags
, enc
);
7460 str_shared_replace(str
, rb_str_casemap(str
, &flags
, enc
));
7462 if (ONIGENC_CASE_MODIFIED
&flags
) return str
;
7469 * swapcase(*options) -> string
7471 * Returns a string containing the characters in +self+, with cases reversed;
7472 * each uppercase character is downcased;
7473 * each lowercase character is upcased:
7475 * s = 'Hello World!' # => "Hello World!"
7476 * s.swapcase # => "hELLO wORLD!"
7478 * The casing may be affected by the given +options+;
7479 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7481 * Related: String#swapcase!.
7486 rb_str_swapcase(int argc
, VALUE
*argv
, VALUE str
)
7489 OnigCaseFoldType flags
= ONIGENC_CASE_UPCASE
| ONIGENC_CASE_DOWNCASE
;
7492 flags
= check_case_options(argc
, argv
, flags
);
7493 enc
= str_true_enc(str
);
7494 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return str_duplicate(rb_cString
, str
);
7495 if (flags
&ONIGENC_CASE_ASCII_ONLY
) {
7496 ret
= rb_str_new(0, RSTRING_LEN(str
));
7497 rb_str_ascii_casemap(str
, ret
, &flags
, enc
);
7500 ret
= rb_str_casemap(str
, &flags
, enc
);
7505 typedef unsigned char *USTR
;
7509 unsigned int now
, max
;
7514 trnext(struct tr
*t
, rb_encoding
*enc
)
7521 if (t
->p
== t
->pend
) return -1;
7522 if (rb_enc_ascget(t
->p
, t
->pend
, &n
, enc
) == '\\' && t
->p
+ n
< t
->pend
) {
7525 t
->now
= rb_enc_codepoint_len(t
->p
, t
->pend
, &n
, enc
);
7527 if (rb_enc_ascget(t
->p
, t
->pend
, &n
, enc
) == '-' && t
->p
+ n
< t
->pend
) {
7529 if (t
->p
< t
->pend
) {
7530 unsigned int c
= rb_enc_codepoint_len(t
->p
, t
->pend
, &n
, enc
);
7533 if (t
->now
< 0x80 && c
< 0x80) {
7534 rb_raise(rb_eArgError
,
7535 "invalid range \"%c-%c\" in string transliteration",
7539 rb_raise(rb_eArgError
, "invalid range in string transliteration");
7541 continue; /* not reached */
7550 while (ONIGENC_CODE_TO_MBCLEN(enc
, ++t
->now
) <= 0) {
7551 if (t
->now
== t
->max
) {
7556 if (t
->now
< t
->max
) {
7567 static VALUE
rb_str_delete_bang(int,VALUE
*,VALUE
);
7570 tr_trans(VALUE str
, VALUE src
, VALUE repl
, int sflag
)
7572 const unsigned int errc
= -1;
7573 unsigned int trans
[256];
7574 rb_encoding
*enc
, *e1
, *e2
;
7575 struct tr trsrc
, trrepl
;
7577 unsigned int c
, c0
, last
= 0;
7578 int modify
= 0, i
, l
;
7579 unsigned char *s
, *send
;
7581 int singlebyte
= single_byte_optimizable(str
);
7585 #define CHECK_IF_ASCII(c) \
7586 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7587 (cr = ENC_CODERANGE_VALID) : 0)
7591 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return Qnil
;
7592 if (RSTRING_LEN(repl
) == 0) {
7593 return rb_str_delete_bang(1, &src
, str
);
7596 cr
= ENC_CODERANGE(str
);
7597 e1
= rb_enc_check(str
, src
);
7598 e2
= rb_enc_check(str
, repl
);
7603 enc
= rb_enc_check(src
, repl
);
7605 trsrc
.p
= RSTRING_PTR(src
); trsrc
.pend
= trsrc
.p
+ RSTRING_LEN(src
);
7606 if (RSTRING_LEN(src
) > 1 &&
7607 rb_enc_ascget(trsrc
.p
, trsrc
.pend
, &l
, enc
) == '^' &&
7608 trsrc
.p
+ l
< trsrc
.pend
) {
7612 trrepl
.p
= RSTRING_PTR(repl
);
7613 trrepl
.pend
= trrepl
.p
+ RSTRING_LEN(repl
);
7614 trsrc
.gen
= trrepl
.gen
= 0;
7615 trsrc
.now
= trrepl
.now
= 0;
7616 trsrc
.max
= trrepl
.max
= 0;
7619 for (i
=0; i
<256; i
++) {
7622 while ((c
= trnext(&trsrc
, enc
)) != errc
) {
7627 if (!hash
) hash
= rb_hash_new();
7628 rb_hash_aset(hash
, UINT2NUM(c
), Qtrue
);
7631 while ((c
= trnext(&trrepl
, enc
)) != errc
)
7632 /* retrieve last replacer */;
7634 for (i
=0; i
<256; i
++) {
7635 if (trans
[i
] != errc
) {
7643 for (i
=0; i
<256; i
++) {
7646 while ((c
= trnext(&trsrc
, enc
)) != errc
) {
7647 r
= trnext(&trrepl
, enc
);
7648 if (r
== errc
) r
= trrepl
.now
;
7651 if (rb_enc_codelen(r
, enc
) != 1) singlebyte
= 0;
7654 if (!hash
) hash
= rb_hash_new();
7655 rb_hash_aset(hash
, UINT2NUM(c
), UINT2NUM(r
));
7660 if (cr
== ENC_CODERANGE_VALID
&& rb_enc_asciicompat(e1
))
7661 cr
= ENC_CODERANGE_7BIT
;
7662 str_modify_keep_cr(str
);
7663 s
= (unsigned char *)RSTRING_PTR(str
); send
= (unsigned char *)RSTRING_END(str
);
7664 termlen
= rb_enc_mbminlen(enc
);
7667 long offset
, max
= RSTRING_LEN(str
);
7668 unsigned int save
= -1;
7669 unsigned char *buf
= ALLOC_N(unsigned char, max
+ termlen
), *t
= buf
;
7674 c0
= c
= rb_enc_codepoint_len((char *)s
, (char *)send
, &clen
, e1
);
7675 tlen
= enc
== e1
? clen
: rb_enc_codelen(c
, enc
);
7682 VALUE tmp
= rb_hash_lookup(hash
, UINT2NUM(c
));
7684 if (cflag
) c
= last
;
7687 else if (cflag
) c
= errc
;
7688 else c
= NUM2INT(tmp
);
7693 if (c
!= (unsigned int)-1) {
7699 tlen
= rb_enc_codelen(c
, enc
);
7705 if (enc
!= e1
) may_modify
= 1;
7707 if ((offset
= t
- buf
) + tlen
> max
) {
7708 size_t MAYBE_UNUSED(old
) = max
+ termlen
;
7709 max
= offset
+ tlen
+ (send
- s
);
7710 SIZED_REALLOC_N(buf
, unsigned char, max
+ termlen
, old
);
7713 rb_enc_mbcput(c
, t
, enc
);
7714 if (may_modify
&& memcmp(s
, t
, tlen
) != 0) {
7720 if (!STR_EMBED_P(str
)) {
7721 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
7723 TERM_FILL((char *)t
, termlen
);
7724 RSTRING(str
)->as
.heap
.ptr
= (char *)buf
;
7725 RSTRING(str
)->as
.heap
.len
= t
- buf
;
7726 STR_SET_NOEMBED(str
);
7727 RSTRING(str
)->as
.heap
.aux
.capa
= max
;
7729 else if (rb_enc_mbmaxlen(enc
) == 1 || (singlebyte
&& !hash
)) {
7731 c
= (unsigned char)*s
;
7732 if (trans
[c
] != errc
) {
7749 long offset
, max
= (long)((send
- s
) * 1.2);
7750 unsigned char *buf
= ALLOC_N(unsigned char, max
+ termlen
), *t
= buf
;
7754 c0
= c
= rb_enc_codepoint_len((char *)s
, (char *)send
, &clen
, e1
);
7755 tlen
= enc
== e1
? clen
: rb_enc_codelen(c
, enc
);
7761 VALUE tmp
= rb_hash_lookup(hash
, UINT2NUM(c
));
7763 if (cflag
) c
= last
;
7766 else if (cflag
) c
= errc
;
7767 else c
= NUM2INT(tmp
);
7770 c
= cflag
? last
: errc
;
7773 tlen
= rb_enc_codelen(c
, enc
);
7778 if (enc
!= e1
) may_modify
= 1;
7780 if ((offset
= t
- buf
) + tlen
> max
) {
7781 size_t MAYBE_UNUSED(old
) = max
+ termlen
;
7782 max
= offset
+ tlen
+ (long)((send
- s
) * 1.2);
7783 SIZED_REALLOC_N(buf
, unsigned char, max
+ termlen
, old
);
7787 rb_enc_mbcput(c
, t
, enc
);
7788 if (may_modify
&& memcmp(s
, t
, tlen
) != 0) {
7796 if (!STR_EMBED_P(str
)) {
7797 ruby_sized_xfree(STR_HEAP_PTR(str
), STR_HEAP_SIZE(str
));
7799 TERM_FILL((char *)t
, termlen
);
7800 RSTRING(str
)->as
.heap
.ptr
= (char *)buf
;
7801 RSTRING(str
)->as
.heap
.len
= t
- buf
;
7802 STR_SET_NOEMBED(str
);
7803 RSTRING(str
)->as
.heap
.aux
.capa
= max
;
7807 if (cr
!= ENC_CODERANGE_BROKEN
)
7808 ENC_CODERANGE_SET(str
, cr
);
7809 rb_enc_associate(str
, enc
);
7818 * str.tr!(from_str, to_str) -> str or nil
7820 * Translates <i>str</i> in place, using the same rules as
7821 * String#tr. Returns <i>str</i>, or <code>nil</code> if no changes were made.
7826 rb_str_tr_bang(VALUE str
, VALUE src
, VALUE repl
)
7828 return tr_trans(str
, src
, repl
, 0);
7834 * str.tr(from_str, to_str) -> new_str
7836 * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7837 * corresponding characters in +to_str+. If +to_str+ is shorter than
7838 * +from_str+, it is padded with its last character in order to maintain the correspondence.
7841 * "hello".tr('el', 'ip') #=> "hippo"
7842 * "hello".tr('aeiou', '*') #=> "h*ll*"
7843 * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7845 * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7846 * characters, and +from_str+ may start with a <code>^</code>, which denotes
7847 * all characters except those listed.
7849 * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7850 * "hello".tr('^aeiou', '*') #=> "*e**o"
7852 * The backslash character <code>\\</code> can be used to escape
7853 * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7854 * appears at the end of a range or the end of the +from_str+ or +to_str+:
7856 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7857 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7859 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7860 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7861 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7863 * "X['\\b']".tr("X\\", "") #=> "['b']"
7864 * "X['\\b']".tr("X-\\]", "") #=> "'b'"
7868 rb_str_tr(VALUE str
, VALUE src
, VALUE repl
)
7870 str
= str_duplicate(rb_cString
, str
);
7871 tr_trans(str
, src
, repl
, 0);
7875 #define TR_TABLE_MAX (UCHAR_MAX+1)
7876 #define TR_TABLE_SIZE (TR_TABLE_MAX+1)
7878 tr_setup_table(VALUE str
, char stable
[TR_TABLE_SIZE
], int first
,
7879 VALUE
*tablep
, VALUE
*ctablep
, rb_encoding
*enc
)
7881 const unsigned int errc
= -1;
7882 char buf
[TR_TABLE_MAX
];
7885 VALUE table
= 0, ptable
= 0;
7886 int i
, l
, cflag
= 0;
7888 tr
.p
= RSTRING_PTR(str
); tr
.pend
= tr
.p
+ RSTRING_LEN(str
);
7889 tr
.gen
= tr
.now
= tr
.max
= 0;
7891 if (RSTRING_LEN(str
) > 1 && rb_enc_ascget(tr
.p
, tr
.pend
, &l
, enc
) == '^') {
7896 for (i
=0; i
<TR_TABLE_MAX
; i
++) {
7899 stable
[TR_TABLE_MAX
] = cflag
;
7901 else if (stable
[TR_TABLE_MAX
] && !cflag
) {
7902 stable
[TR_TABLE_MAX
] = 0;
7904 for (i
=0; i
<TR_TABLE_MAX
; i
++) {
7908 while ((c
= trnext(&tr
, enc
)) != errc
) {
7909 if (c
< TR_TABLE_MAX
) {
7910 buf
[(unsigned char)c
] = !cflag
;
7913 VALUE key
= UINT2NUM(c
);
7915 if (!table
&& (first
|| *tablep
|| stable
[TR_TABLE_MAX
])) {
7918 table
= ptable
? ptable
: rb_hash_new();
7922 table
= rb_hash_new();
7927 if (table
&& (!ptable
|| (cflag
^ !NIL_P(rb_hash_aref(ptable
, key
))))) {
7928 rb_hash_aset(table
, key
, Qtrue
);
7932 for (i
=0; i
<TR_TABLE_MAX
; i
++) {
7933 stable
[i
] = stable
[i
] && buf
[i
];
7935 if (!table
&& !cflag
) {
7942 tr_find(unsigned int c
, const char table
[TR_TABLE_SIZE
], VALUE del
, VALUE nodel
)
7944 if (c
< TR_TABLE_MAX
) {
7945 return table
[c
] != 0;
7948 VALUE v
= UINT2NUM(c
);
7951 if (!NIL_P(rb_hash_lookup(del
, v
)) &&
7952 (!nodel
|| NIL_P(rb_hash_lookup(nodel
, v
)))) {
7956 else if (nodel
&& !NIL_P(rb_hash_lookup(nodel
, v
))) {
7959 return table
[TR_TABLE_MAX
] ? TRUE
: FALSE
;
7965 * str.delete!([other_str]+) -> str or nil
7967 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7968 * <code>nil</code> if <i>str</i> was not modified.
7972 rb_str_delete_bang(int argc
, VALUE
*argv
, VALUE str
)
7974 char squeez
[TR_TABLE_SIZE
];
7975 rb_encoding
*enc
= 0;
7977 VALUE del
= 0, nodel
= 0;
7979 int i
, ascompat
, cr
;
7981 if (RSTRING_LEN(str
) == 0 || !RSTRING_PTR(str
)) return Qnil
;
7982 rb_check_arity(argc
, 1, UNLIMITED_ARGUMENTS
);
7983 for (i
=0; i
<argc
; i
++) {
7987 enc
= rb_enc_check(str
, s
);
7988 tr_setup_table(s
, squeez
, i
==0, &del
, &nodel
, enc
);
7991 str_modify_keep_cr(str
);
7992 ascompat
= rb_enc_asciicompat(enc
);
7993 s
= t
= RSTRING_PTR(str
);
7994 send
= RSTRING_END(str
);
7995 cr
= ascompat
? ENC_CODERANGE_7BIT
: ENC_CODERANGE_VALID
;
8000 if (ascompat
&& (c
= *(unsigned char*)s
) < 0x80) {
8011 c
= rb_enc_codepoint_len(s
, send
, &clen
, enc
);
8013 if (tr_find(c
, squeez
, del
, nodel
)) {
8017 if (t
!= s
) rb_enc_mbcput(c
, t
, enc
);
8019 if (cr
== ENC_CODERANGE_7BIT
) cr
= ENC_CODERANGE_VALID
;
8024 TERM_FILL(t
, TERM_LEN(str
));
8025 STR_SET_LEN(str
, t
- RSTRING_PTR(str
));
8026 ENC_CODERANGE_SET(str
, cr
);
8028 if (modify
) return str
;
8035 * str.delete([other_str]+) -> new_str
8037 * Returns a copy of <i>str</i> with all characters in the intersection of its
8038 * arguments deleted. Uses the same rules for building the set of characters as String#count.
8041 * "hello".delete "l","lo" #=> "heo"
8042 * "hello".delete "lo" #=> "he"
8043 * "hello".delete "aeiou", "^e" #=> "hell"
8044 * "hello".delete "ej-m" #=> "ho"
8048 rb_str_delete(int argc
, VALUE
*argv
, VALUE str
)
8050 str
= str_duplicate(rb_cString
, str
);
8051 rb_str_delete_bang(argc
, argv
, str
);
8058 * str.squeeze!([other_str]*) -> str or nil
8060 * Squeezes <i>str</i> in place, returning either <i>str</i>, or
8061 * <code>nil</code> if no changes were made.
8065 rb_str_squeeze_bang(int argc
, VALUE
*argv
, VALUE str
)
8067 char squeez
[TR_TABLE_SIZE
];
8068 rb_encoding
*enc
= 0;
8069 VALUE del
= 0, nodel
= 0;
8070 unsigned char *s
, *send
, *t
;
8072 int ascompat
, singlebyte
= single_byte_optimizable(str
);
8076 enc
= STR_ENC_GET(str
);
8079 for (i
=0; i
<argc
; i
++) {
8083 enc
= rb_enc_check(str
, s
);
8084 if (singlebyte
&& !single_byte_optimizable(s
))
8086 tr_setup_table(s
, squeez
, i
==0, &del
, &nodel
, enc
);
8090 str_modify_keep_cr(str
);
8091 s
= t
= (unsigned char *)RSTRING_PTR(str
);
8092 if (!s
|| RSTRING_LEN(str
) == 0) return Qnil
;
8093 send
= (unsigned char *)RSTRING_END(str
);
8095 ascompat
= rb_enc_asciicompat(enc
);
8099 unsigned int c
= *s
++;
8100 if (c
!= save
|| (argc
> 0 && !squeez
[c
])) {
8110 if (ascompat
&& (c
= *s
) < 0x80) {
8111 if (c
!= save
|| (argc
> 0 && !squeez
[c
])) {
8117 c
= rb_enc_codepoint_len((char *)s
, (char *)send
, &clen
, enc
);
8119 if (c
!= save
|| (argc
> 0 && !tr_find(c
, squeez
, del
, nodel
))) {
8120 if (t
!= s
) rb_enc_mbcput(c
, t
, enc
);
8129 TERM_FILL((char *)t
, TERM_LEN(str
));
8130 if ((char *)t
- RSTRING_PTR(str
) != RSTRING_LEN(str
)) {
8131 STR_SET_LEN(str
, (char *)t
- RSTRING_PTR(str
));
8135 if (modify
) return str
;
8142 * str.squeeze([other_str]*) -> new_str
8144 * Builds a set of characters from the <i>other_str</i> parameter(s)
8145 * using the procedure described for String#count. Returns a new
8146 * string where runs of the same character that occur in this set are
8147 * replaced by a single character. If no arguments are given, all
8148 * runs of identical characters are replaced by a single character.
8150 * "yellow moon".squeeze #=> "yelow mon"
8151 * "  now   is  the".squeeze(" ") #=> " now is the"
8152 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8156 rb_str_squeeze(int argc
, VALUE
*argv
, VALUE str
)
8158 str
= str_duplicate(rb_cString
, str
);
8159 rb_str_squeeze_bang(argc
, argv
, str
);
8166 * str.tr_s!(from_str, to_str) -> str or nil
8168 * Performs String#tr_s processing on <i>str</i> in place,
8169 * returning <i>str</i>, or <code>nil</code> if no changes were made.
8173 rb_str_tr_s_bang(VALUE str
, VALUE src
, VALUE repl
)
8175 return tr_trans(str
, src
, repl
, 1);
8181 * str.tr_s(from_str, to_str) -> new_str
8183 * Processes a copy of <i>str</i> as described under String#tr, then
8184 * removes duplicate characters in regions that were affected by the translation.
8187 * "hello".tr_s('l', 'r') #=> "hero"
8188 * "hello".tr_s('el', '*') #=> "h*o"
8189 * "hello".tr_s('el', 'hx') #=> "hhxo"
8193 rb_str_tr_s(VALUE str
, VALUE src
, VALUE repl
)
8195 str
= str_duplicate(rb_cString
, str
);
8196 tr_trans(str
, src
, repl
, 1);
8203 * str.count([other_str]+) -> integer
8205 * Each +other_str+ parameter defines a set of characters to count. The
8206 * intersection of these sets defines the characters to count in +str+. Any
8207 * +other_str+ that starts with a caret <code>^</code> is negated. The
8208 * sequence <code>c1-c2</code> means all characters between c1 and c2. The
8209 * backslash character <code>\\</code> can be used to escape <code>^</code> or
8210 * <code>-</code> and is otherwise ignored unless it appears at the end of a
8211 * sequence or the end of an +other_str+.
8213 * a = "hello world"
8214 * a.count "lo" #=> 5
8215 * a.count "lo", "o" #=> 2
8216 * a.count "hello", "^l" #=> 4
8217 * a.count "ej-m" #=> 4
8219 * "hello^world".count "\\^aeiou" #=> 4
8220 * "hello-world".count "a\\-eo" #=> 4
8222 * c = "hello world\\r\\n"
8223 * c.count "\\" #=> 2
8224 * c.count "\\A" #=> 0
8225 * c.count "X-\\w" #=> 3
8229 rb_str_count(int argc
, VALUE
*argv
, VALUE str
)
8231 char table
[TR_TABLE_SIZE
];
8232 rb_encoding
*enc
= 0;
8233 VALUE del
= 0, nodel
= 0, tstr
;
8239 rb_check_arity(argc
, 1, UNLIMITED_ARGUMENTS
);
8243 enc
= rb_enc_check(str
, tstr
);
8246 if (RSTRING_LEN(tstr
) == 1 && rb_enc_asciicompat(enc
) &&
8247 (ptstr
= RSTRING_PTR(tstr
),
8248 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc
, (const unsigned char *)ptstr
, (const unsigned char *)ptstr
+1)) &&
8249 !is_broken_string(str
)) {
8251 unsigned char c
= rb_enc_codepoint_len(ptstr
, ptstr
+1, &clen
, enc
);
8253 s
= RSTRING_PTR(str
);
8254 if (!s
|| RSTRING_LEN(str
) == 0) return INT2FIX(0);
8255 send
= RSTRING_END(str
);
8257 if (*(unsigned char*)s
++ == c
) n
++;
8259 return SIZET2NUM(n
);
8263 tr_setup_table(tstr
, table
, TRUE
, &del
, &nodel
, enc
);
8264 for (i
=1; i
<argc
; i
++) {
8267 enc
= rb_enc_check(str
, tstr
);
8268 tr_setup_table(tstr
, table
, FALSE
, &del
, &nodel
, enc
);
8271 s
= RSTRING_PTR(str
);
8272 if (!s
|| RSTRING_LEN(str
) == 0) return INT2FIX(0);
8273 send
= RSTRING_END(str
);
8274 ascompat
= rb_enc_asciicompat(enc
);
8278 if (ascompat
&& (c
= *(unsigned char*)s
) < 0x80) {
8286 c
= rb_enc_codepoint_len(s
, send
, &clen
, enc
);
8287 if (tr_find(c
, table
, del
, nodel
)) {
8294 return SIZET2NUM(n
);
8298 rb_fs_check(VALUE val
)
8300 if (!NIL_P(val
) && !RB_TYPE_P(val
, T_STRING
) && !RB_TYPE_P(val
, T_REGEXP
)) {
8301 val
= rb_check_string_type(val
);
8302 if (NIL_P(val
)) return 0;
8307 static const char isspacetable
[256] = {
8308 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
8309 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8310 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8311 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8312 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8313 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8314 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8315 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8316 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8317 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8318 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8319 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8320 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8321 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8322 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8323 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
8326 #define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8329 split_string(VALUE result
, VALUE str
, long beg
, long len
, long empty_count
)
8331 if (empty_count
>= 0 && len
== 0) {
8332 return empty_count
+ 1;
8334 if (empty_count
> 0) {
8335 /* make different substrings */
8338 rb_ary_push(result
, str_new_empty_String(str
));
8339 } while (--empty_count
> 0);
8343 rb_yield(str_new_empty_String(str
));
8344 } while (--empty_count
> 0);
8347 str
= rb_str_subseq(str
, beg
, len
);
8349 rb_ary_push(result
, str
);
8358 SPLIT_TYPE_AWK
, SPLIT_TYPE_STRING
, SPLIT_TYPE_REGEXP
, SPLIT_TYPE_CHARS
8362 literal_split_pattern(VALUE spat
, split_type_t default_type
)
8364 rb_encoding
*enc
= STR_ENC_GET(spat
);
8367 RSTRING_GETMEM(spat
, ptr
, len
);
8369 /* Special case - split into chars */
8370 return SPLIT_TYPE_CHARS
;
8372 else if (rb_enc_asciicompat(enc
)) {
8373 if (len
== 1 && ptr
[0] == ' ') {
8374 return SPLIT_TYPE_AWK
;
8379 if (rb_enc_ascget(ptr
, ptr
+ len
, &l
, enc
) == ' ' && len
== l
) {
8380 return SPLIT_TYPE_AWK
;
8383 return default_type
;
8388 * str.split(pattern=nil, [limit]) -> an_array
8389 * str.split(pattern=nil, [limit]) {|sub| block } -> str
8391 * Divides <i>str</i> into substrings based on a delimiter, returning an array
8392 * of these substrings.
8394 * If <i>pattern</i> is a String, then its contents are used as
8395 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8396 * space, <i>str</i> is split on whitespace, with leading and trailing
8397 * whitespace and runs of contiguous whitespace characters ignored.
8399 * If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8400 * pattern matches. Whenever the pattern matches a zero-length string,
8401 * <i>str</i> is split into individual characters. If <i>pattern</i> contains
8402 * groups, the respective matches will be returned in the array as well.
8404 * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8405 * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8406 * split on whitespace as if ' ' were specified.
8408 * If the <i>limit</i> parameter is omitted, trailing null fields are
8409 * suppressed. If <i>limit</i> is a positive number, at most that number
8410 * of split substrings will be returned (captured groups will be returned
8411 * as well, but are not counted towards the limit).
8412 * If <i>limit</i> is <code>1</code>, the entire
8413 * string is returned as the only entry in an array. If negative, there is no
8414 * limit to the number of fields returned, and trailing null fields are not suppressed.
8417 * When the input +str+ is empty an empty Array is returned as the string is
8418 * considered to have no fields to split.
8420 * " now's the time ".split #=> ["now's", "the", "time"]
8421 * " now's the time ".split(' ') #=> ["now's", "the", "time"]
8422 * " now's  the time".split(/ /) #=> ["", "now's", "", "the", "time"]
8423 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8424 * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
8425 * "hello".split(//, 3) #=> ["h", "e", "llo"]
8426 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
8428 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
8429 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
8430 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
8431 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
8433 * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
8435 * "".split(',', -1) #=> []
8437 * If a block is given, invoke the block with each split substring.
8442 rb_str_split_m(int argc
, VALUE
*argv
, VALUE str
)
8447 split_type_t split_type
;
8448 long beg
, end
, i
= 0, empty_count
= -1;
8452 result
= rb_block_given_p() ? Qfalse
: Qnil
;
8453 if (rb_scan_args(argc
, argv
, "02", &spat
, &limit
) == 2) {
8454 lim
= NUM2INT(limit
);
8455 if (lim
<= 0) limit
= Qnil
;
8456 else if (lim
== 1) {
8457 if (RSTRING_LEN(str
) == 0)
8458 return result
? rb_ary_new2(0) : str
;
8459 tmp
= str_duplicate(rb_cString
, str
);
8464 return rb_ary_new3(1, tmp
);
8468 if (NIL_P(limit
) && !lim
) empty_count
= 0;
8470 enc
= STR_ENC_GET(str
);
8471 split_type
= SPLIT_TYPE_REGEXP
;
8473 spat
= get_pat_quoted(spat
, 0);
8475 else if (NIL_P(spat
= rb_fs
)) {
8476 split_type
= SPLIT_TYPE_AWK
;
8478 else if (!(spat
= rb_fs_check(spat
))) {
8479 rb_raise(rb_eTypeError
, "value of $; must be String or Regexp");
8482 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED
, "$; is set to non-nil value");
8484 if (split_type
!= SPLIT_TYPE_AWK
) {
8485 switch (BUILTIN_TYPE(spat
)) {
8487 rb_reg_options(spat
); /* check if uninitialized */
8488 tmp
= RREGEXP_SRC(spat
);
8489 split_type
= literal_split_pattern(tmp
, SPLIT_TYPE_REGEXP
);
8490 if (split_type
== SPLIT_TYPE_AWK
) {
8492 split_type
= SPLIT_TYPE_STRING
;
8497 mustnot_broken(spat
);
8498 split_type
= literal_split_pattern(spat
, SPLIT_TYPE_STRING
);
8502 UNREACHABLE_RETURN(Qnil
);
8506 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8508 if (result
) result
= rb_ary_new();
8510 char *ptr
= RSTRING_PTR(str
);
8511 char *eptr
= RSTRING_END(str
);
8512 if (split_type
== SPLIT_TYPE_AWK
) {
8518 if (is_ascii_string(str
)) {
8519 while (ptr
< eptr
) {
8520 c
= (unsigned char)*ptr
++;
8522 if (ascii_isspace(c
)) {
8528 if (!NIL_P(limit
) && lim
<= i
) break;
8531 else if (ascii_isspace(c
)) {
8532 SPLIT_STR(beg
, end
-beg
);
8535 if (!NIL_P(limit
)) ++i
;
8543 while (ptr
< eptr
) {
8546 c
= rb_enc_codepoint_len(ptr
, eptr
, &n
, enc
);
8549 if (rb_isspace(c
)) {
8555 if (!NIL_P(limit
) && lim
<= i
) break;
8558 else if (rb_isspace(c
)) {
8559 SPLIT_STR(beg
, end
-beg
);
8562 if (!NIL_P(limit
)) ++i
;
8570 else if (split_type
== SPLIT_TYPE_STRING
) {
8571 char *str_start
= ptr
;
8572 char *substr_start
= ptr
;
8573 char *sptr
= RSTRING_PTR(spat
);
8574 long slen
= RSTRING_LEN(spat
);
8576 mustnot_broken(str
);
8577 enc
= rb_enc_check(str
, spat
);
8578 while (ptr
< eptr
&&
8579 (end
= rb_memsearch(sptr
, slen
, ptr
, eptr
- ptr
, enc
)) >= 0) {
8580 /* Check we are at the start of a char */
8581 char *t
= rb_enc_right_char_head(ptr
, ptr
+ end
, eptr
, enc
);
8582 if (t
!= ptr
+ end
) {
8586 SPLIT_STR(substr_start
- str_start
, (ptr
+end
) - substr_start
);
8589 if (!NIL_P(limit
) && lim
<= ++i
) break;
8591 beg
= ptr
- str_start
;
8593 else if (split_type
== SPLIT_TYPE_CHARS
) {
8594 char *str_start
= ptr
;
8597 mustnot_broken(str
);
8598 enc
= rb_enc_get(str
);
8599 while (ptr
< eptr
&&
8600 (n
= rb_enc_precise_mbclen(ptr
, eptr
, enc
)) > 0) {
8601 SPLIT_STR(ptr
- str_start
, n
);
8603 if (!NIL_P(limit
) && lim
<= ++i
) break;
8605 beg
= ptr
- str_start
;
8608 long len
= RSTRING_LEN(str
);
8612 struct re_registers
*regs
;
8615 for (; rb_reg_search(spat
, str
, start
, 0) >= 0;
8616 (match
? (rb_match_unbusy(match
), rb_backref_set(match
)) : (void)0)) {
8617 match
= rb_backref_get();
8618 if (!result
) rb_match_busy(match
);
8619 regs
= RMATCH_REGS(match
);
8621 if (start
== end
&& BEG(0) == END(0)) {
8626 else if (last_null
== 1) {
8627 SPLIT_STR(beg
, rb_enc_fast_mbclen(ptr
+beg
, eptr
, enc
));
8634 start
+= rb_enc_fast_mbclen(ptr
+start
,eptr
,enc
);
8640 SPLIT_STR(beg
, end
-beg
);
8641 beg
= start
= END(0);
8645 for (idx
=1; idx
< regs
->num_regs
; idx
++) {
8646 if (BEG(idx
) == -1) continue;
8647 SPLIT_STR(BEG(idx
), END(idx
)-BEG(idx
));
8649 if (!NIL_P(limit
) && lim
<= ++i
) break;
8651 if (match
) rb_match_unbusy(match
);
8653 if (RSTRING_LEN(str
) > 0 && (!NIL_P(limit
) || RSTRING_LEN(str
) > beg
|| lim
< 0)) {
8654 SPLIT_STR(beg
, RSTRING_LEN(str
)-beg
);
8657 return result
? result
: str
;
8661 rb_str_split(VALUE str
, const char *sep0
)
8666 sep
= rb_str_new_cstr(sep0
);
8667 return rb_str_split_m(1, &sep
, str
);
8670 #define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8673 enumerator_element(VALUE ary
, VALUE e
)
8676 rb_ary_push(ary
, e
);
8685 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8688 chomp_newline(const char *p
, const char *e
, rb_encoding
*enc
)
8690 const char *prev
= rb_enc_prev_char(p
, e
, e
, enc
);
8691 if (rb_enc_is_newline(prev
, e
, enc
)) {
8693 prev
= rb_enc_prev_char(p
, e
, e
, enc
);
8694 if (prev
&& rb_enc_ascget(prev
, e
, NULL
, enc
) == '\r')
8705 (!RB_TYPE_P(rs
, T_STRING
) ||
8706 RSTRING_LEN(rs
) != 1 ||
8707 RSTRING_PTR(rs
)[0] != '\n')) {
8708 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED
, "$/ is set to non-default value");
8713 #define rb_rs get_rs()
8716 rb_str_enumerate_lines(int argc
, VALUE
*argv
, VALUE str
, VALUE ary
)
8719 VALUE line
, rs
, orig
= str
, opts
= Qnil
, chomp
= Qfalse
;
8720 const char *ptr
, *pend
, *subptr
, *subend
, *rsptr
, *hit
, *adjusted
;
8721 long pos
, len
, rslen
;
8724 if (rb_scan_args(argc
, argv
, "01:", &rs
, &opts
) == 0)
8727 static ID keywords
[1];
8729 keywords
[0] = rb_intern_const("chomp");
8731 rb_get_kwargs(opts
, keywords
, 0, 1, &chomp
);
8732 chomp
= (chomp
!= Qundef
&& RTEST(chomp
));
8736 if (!ENUM_ELEM(ary
, str
)) {
8744 if (!RSTRING_LEN(str
)) goto end
;
8745 str
= rb_str_new_frozen(str
);
8746 ptr
= subptr
= RSTRING_PTR(str
);
8747 pend
= RSTRING_END(str
);
8748 len
= RSTRING_LEN(str
);
8750 rslen
= RSTRING_LEN(rs
);
8752 if (rs
== rb_default_rs
)
8753 enc
= rb_enc_get(str
);
8755 enc
= rb_enc_check(str
, rs
);
8758 /* paragraph mode */
8760 const char *eol
= NULL
;
8762 while (subend
< pend
) {
8764 if (rb_enc_ascget(subend
, pend
, &n
, enc
) != '\r')
8766 rslen
= n
+ rb_enc_mbclen(subend
+ n
, pend
, enc
);
8767 if (rb_enc_is_newline(subend
+ n
, pend
, enc
)) {
8768 if (eol
== subend
) break;
8770 if (subptr
) eol
= subend
;
8773 if (!subptr
) subptr
= subend
;
8777 } while (subend
< pend
);
8779 line
= rb_str_subseq(str
, subptr
- ptr
,
8780 subend
- subptr
+ (chomp
? 0 : rslen
));
8781 if (ENUM_ELEM(ary
, line
)) {
8782 str_mod_check(str
, ptr
, len
);
8784 subptr
= eol
= NULL
;
8789 rsptr
= RSTRING_PTR(rs
);
8790 if (RSTRING_LEN(rs
) == rb_enc_mbminlen(enc
) &&
8791 rb_enc_is_newline(rsptr
, rsptr
+ RSTRING_LEN(rs
), enc
)) {
8796 if ((rs
== rb_default_rs
) && !rb_enc_asciicompat(enc
)) {
8797 rs
= rb_str_new(rsptr
, rslen
);
8798 rs
= rb_str_encode(rs
, rb_enc_from_encoding(enc
), 0, Qnil
);
8799 rsptr
= RSTRING_PTR(rs
);
8800 rslen
= RSTRING_LEN(rs
);
8803 while (subptr
< pend
) {
8804 pos
= rb_memsearch(rsptr
, rslen
, subptr
, pend
- subptr
, enc
);
8807 adjusted
= rb_enc_right_char_head(subptr
, hit
, pend
, enc
);
8808 if (hit
!= adjusted
) {
8812 subend
= hit
+= rslen
;
8815 subend
= chomp_newline(subptr
, subend
, enc
);
8821 line
= rb_str_subseq(str
, subptr
- ptr
, subend
- subptr
);
8822 if (ENUM_ELEM(ary
, line
)) {
8823 str_mod_check(str
, ptr
, len
);
8828 if (subptr
!= pend
) {
8831 pend
= chomp_newline(subptr
, pend
, enc
);
8833 else if (pend
- subptr
>= rslen
&&
8834 memcmp(pend
- rslen
, rsptr
, rslen
) == 0) {
8838 line
= rb_str_subseq(str
, subptr
- ptr
, pend
- subptr
);
8839 ENUM_ELEM(ary
, line
);
8852 * str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8853 * str.each_line(separator=$/, chomp: false) -> an_enumerator
8855 * Splits <i>str</i> using the supplied parameter as the record
8856 * separator (<code>$/</code> by default), passing each substring in
8857 * turn to the supplied block. If a zero-length record separator is
8858 * supplied, the string is split into paragraphs delimited by
8859 * multiple successive newlines.
8861 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8864 * If no block is given, an enumerator is returned instead.
8866 * "hello\nworld".each_line {|s| p s}
8871 * "hello\nworld".each_line('l') {|s| p s}
8878 * "hello\n\n\nworld".each_line('') {|s| p s}
8883 * "hello\nworld".each_line(chomp: true) {|s| p s}
8888 * "hello\nworld".each_line('l', chomp: true) {|s| p s}
8898 rb_str_each_line(int argc
, VALUE
*argv
, VALUE str
)
8900 RETURN_SIZED_ENUMERATOR(str
, argc
, argv
, 0);
8901 return rb_str_enumerate_lines(argc
, argv
, str
, 0);
8906 * str.lines(separator=$/, chomp: false) -> an_array
8908 * Returns an array of lines in <i>str</i> split using the supplied
8909 * record separator (<code>$/</code> by default). This is a
8910 * shorthand for <code>str.each_line(separator, chomp: chomp).to_a</code>.
8912 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8915 * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
8916 * "hello world".lines(' ') #=> ["hello ", " ", "world"]
8917 * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8919 * If a block is given, which is a deprecated form, this works the same as
8920 * <code>each_line</code>.
8924 rb_str_lines(int argc
, VALUE
*argv
, VALUE str
)
8926 VALUE ary
= WANTARRAY("lines", 0);
8927 return rb_str_enumerate_lines(argc
, argv
, str
, ary
);
8931 rb_str_each_byte_size(VALUE str
, VALUE args
, VALUE eobj
)
8933 return LONG2FIX(RSTRING_LEN(str
));
8937 rb_str_enumerate_bytes(VALUE str
, VALUE ary
)
8941 for (i
=0; i
<RSTRING_LEN(str
); i
++) {
8942 ENUM_ELEM(ary
, INT2FIX((unsigned char)RSTRING_PTR(str
)[i
]));
8952 * str.each_byte {|integer| block } -> str
8953 * str.each_byte -> an_enumerator
8955 * Passes each byte in <i>str</i> to the given block, or returns an
8956 * enumerator if no block is given.
8958 * "hello".each_byte {|c| print c, ' ' }
8960 * <em>produces:</em>
8962 * 104 101 108 108 111
8966 rb_str_each_byte(VALUE str
)
8968 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_byte_size
);
8969 return rb_str_enumerate_bytes(str
, 0);
8974 * str.bytes -> an_array
8976 * Returns an array of bytes in <i>str</i>. This is a shorthand for
8977 * <code>str.each_byte.to_a</code>.
8979 * If a block is given, which is a deprecated form, this works the same as
8980 * <code>each_byte</code>.
8984 rb_str_bytes(VALUE str
)
8986 VALUE ary
= WANTARRAY("bytes", RSTRING_LEN(str
));
8987 return rb_str_enumerate_bytes(str
, ary
);
8991 rb_str_each_char_size(VALUE str
, VALUE args
, VALUE eobj
)
8993 return rb_str_length(str
);
8997 rb_str_enumerate_chars(VALUE str
, VALUE ary
)
9004 str
= rb_str_new_frozen(str
);
9005 ptr
= RSTRING_PTR(str
);
9006 len
= RSTRING_LEN(str
);
9007 enc
= rb_enc_get(str
);
9009 if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str
))) {
9010 for (i
= 0; i
< len
; i
+= n
) {
9011 n
= rb_enc_fast_mbclen(ptr
+ i
, ptr
+ len
, enc
);
9012 ENUM_ELEM(ary
, rb_str_subseq(str
, i
, n
));
9016 for (i
= 0; i
< len
; i
+= n
) {
9017 n
= rb_enc_mbclen(ptr
+ i
, ptr
+ len
, enc
);
9018 ENUM_ELEM(ary
, rb_str_subseq(str
, i
, n
));
9030 * str.each_char {|cstr| block } -> str
9031 * str.each_char -> an_enumerator
9033 * Passes each character in <i>str</i> to the given block, or returns
9034 * an enumerator if no block is given.
9036 * "hello".each_char {|c| print c, ' ' }
9038 * <em>produces:</em>
9044 rb_str_each_char(VALUE str
)
9046 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_char_size
);
9047 return rb_str_enumerate_chars(str
, 0);
9052 * str.chars -> an_array
9054 * Returns an array of characters in <i>str</i>. This is a shorthand
9055 * for <code>str.each_char.to_a</code>.
9057 * If a block is given, which is a deprecated form, this works the same as
9058 * <code>each_char</code>.
9062 rb_str_chars(VALUE str
)
9064 VALUE ary
= WANTARRAY("chars", rb_str_strlen(str
));
9065 return rb_str_enumerate_chars(str
, ary
);
9069 rb_str_enumerate_codepoints(VALUE str
, VALUE ary
)
9074 const char *ptr
, *end
;
9077 if (single_byte_optimizable(str
))
9078 return rb_str_enumerate_bytes(str
, ary
);
9080 str
= rb_str_new_frozen(str
);
9081 ptr
= RSTRING_PTR(str
);
9082 end
= RSTRING_END(str
);
9083 enc
= STR_ENC_GET(str
);
9086 c
= rb_enc_codepoint_len(ptr
, end
, &n
, enc
);
9087 ENUM_ELEM(ary
, UINT2NUM(c
));
9099 * str.each_codepoint {|integer| block } -> str
9100 * str.each_codepoint -> an_enumerator
9102 * Passes the Integer ordinal of each character in <i>str</i>
9103 * (also known as a <i>codepoint</i> when applied to Unicode strings) to the
9104 * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
9105 * values are directly derived from the binary representation
9106 * of each character.
9108 * If no block is given, an enumerator is returned instead.
9110 * "hello\u0639".each_codepoint {|c| print c, ' ' }
9112 * <em>produces:</em>
9114 * 104 101 108 108 111 1593
9118 rb_str_each_codepoint(VALUE str
)
9120 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_char_size
);
9121 return rb_str_enumerate_codepoints(str
, 0);
9126 * str.codepoints -> an_array
9128 * Returns an array of the Integer ordinals of the
9129 * characters in <i>str</i>. This is a shorthand for
9130 * <code>str.each_codepoint.to_a</code>.
9132 * If a block is given, which is a deprecated form, this works the same as
9133 * <code>each_codepoint</code>.
9137 rb_str_codepoints(VALUE str
)
9139 VALUE ary
= WANTARRAY("codepoints", rb_str_strlen(str
));
9140 return rb_str_enumerate_codepoints(str
, ary
);
9144 get_reg_grapheme_cluster(rb_encoding
*enc
)
9146 int encidx
= rb_enc_to_index(enc
);
9147 regex_t
*reg_grapheme_cluster
= NULL
;
9148 static regex_t
*reg_grapheme_cluster_utf8
= NULL
;
9151 if (encidx
== rb_utf8_encindex() && reg_grapheme_cluster_utf8
) {
9152 reg_grapheme_cluster
= reg_grapheme_cluster_utf8
;
9154 if (!reg_grapheme_cluster
) {
9155 const OnigUChar source_ascii
[] = "\\X";
9156 OnigErrorInfo einfo
;
9157 const OnigUChar
*source
= source_ascii
;
9158 size_t source_len
= sizeof(source_ascii
) - 1;
9160 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9161 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9162 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9163 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9164 #define CASE_UTF(e) \
9165 case ENCINDEX_UTF_##e: { \
9166 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9167 source = source_UTF_##e; \
9168 source_len = sizeof(source_UTF_##e); \
9171 CASE_UTF(16BE
); CASE_UTF(16LE
); CASE_UTF(32BE
); CASE_UTF(32LE
);
9178 int r
= onig_new(®_grapheme_cluster
, source
, source
+ source_len
,
9179 ONIG_OPTION_DEFAULT
, enc
, OnigDefaultSyntax
, &einfo
);
9181 UChar message
[ONIG_MAX_ERROR_MESSAGE_LEN
];
9182 onig_error_code_to_str(message
, r
, &einfo
);
9183 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message
);
9185 if (encidx
== rb_utf8_encindex()) {
9186 reg_grapheme_cluster_utf8
= reg_grapheme_cluster
;
9189 return reg_grapheme_cluster
;
9193 rb_str_each_grapheme_cluster_size(VALUE str
, VALUE args
, VALUE eobj
)
9195 size_t grapheme_cluster_count
= 0;
9196 regex_t
*reg_grapheme_cluster
= NULL
;
9197 rb_encoding
*enc
= rb_enc_from_index(ENCODING_GET(str
));
9198 const char *ptr
, *end
;
9200 if (!rb_enc_unicode_p(enc
)) {
9201 return rb_str_length(str
);
9204 reg_grapheme_cluster
= get_reg_grapheme_cluster(enc
);
9205 ptr
= RSTRING_PTR(str
);
9206 end
= RSTRING_END(str
);
9209 OnigPosition len
= onig_match(reg_grapheme_cluster
,
9210 (const OnigUChar
*)ptr
, (const OnigUChar
*)end
,
9211 (const OnigUChar
*)ptr
, NULL
, 0);
9212 if (len
<= 0) break;
9213 grapheme_cluster_count
++;
9217 return SIZET2NUM(grapheme_cluster_count
);
9221 rb_str_enumerate_grapheme_clusters(VALUE str
, VALUE ary
)
9224 regex_t
*reg_grapheme_cluster
= NULL
;
9225 rb_encoding
*enc
= rb_enc_from_index(ENCODING_GET(str
));
9226 const char *ptr0
, *ptr
, *end
;
9228 if (!rb_enc_unicode_p(enc
)) {
9229 return rb_str_enumerate_chars(str
, ary
);
9232 if (!ary
) str
= rb_str_new_frozen(str
);
9233 reg_grapheme_cluster
= get_reg_grapheme_cluster(enc
);
9234 ptr0
= ptr
= RSTRING_PTR(str
);
9235 end
= RSTRING_END(str
);
9238 OnigPosition len
= onig_match(reg_grapheme_cluster
,
9239 (const OnigUChar
*)ptr
, (const OnigUChar
*)end
,
9240 (const OnigUChar
*)ptr
, NULL
, 0);
9241 if (len
<= 0) break;
9242 ENUM_ELEM(ary
, rb_str_subseq(str
, ptr
-ptr0
, len
));
9254 * str.each_grapheme_cluster {|cstr| block } -> str
9255 * str.each_grapheme_cluster -> an_enumerator
9257 * Passes each grapheme cluster in <i>str</i> to the given block, or returns
9258 * an enumerator if no block is given.
9259 * Unlike String#each_char, this enumerates by grapheme clusters defined by
9260 * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9262 * "a\u0300".each_char.to_a.size #=> 2
9263 * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9268 rb_str_each_grapheme_cluster(VALUE str
)
9270 RETURN_SIZED_ENUMERATOR(str
, 0, 0, rb_str_each_grapheme_cluster_size
);
9271 return rb_str_enumerate_grapheme_clusters(str
, 0);
9276 * str.grapheme_clusters -> an_array
9278 * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
9279 * for <code>str.each_grapheme_cluster.to_a</code>.
9281 * If a block is given, which is a deprecated form, this works the same as
9282 * <code>each_grapheme_cluster</code>.
9286 rb_str_grapheme_clusters(VALUE str
)
9288 VALUE ary
= WANTARRAY("grapheme_clusters", rb_str_strlen(str
));
9289 return rb_str_enumerate_grapheme_clusters(str
, ary
);
9293 chopped_length(VALUE str
)
9295 rb_encoding
*enc
= STR_ENC_GET(str
);
9296 const char *p
, *p2
, *beg
, *end
;
9298 beg
= RSTRING_PTR(str
);
9299 end
= beg
+ RSTRING_LEN(str
);
9300 if (beg
>= end
) return 0;
9301 p
= rb_enc_prev_char(beg
, end
, end
, enc
);
9303 if (p
> beg
&& rb_enc_ascget(p
, end
, 0, enc
) == '\n') {
9304 p2
= rb_enc_prev_char(beg
, p
, end
, enc
);
9305 if (p2
&& rb_enc_ascget(p2
, end
, 0, enc
) == '\r') p
= p2
;
9312 * str.chop! -> str or nil
9314 * Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9315 * <code>nil</code> if <i>str</i> is the empty string. See also
9320 rb_str_chop_bang(VALUE str
)
9322 str_modify_keep_cr(str
);
9323 if (RSTRING_LEN(str
) > 0) {
9325 len
= chopped_length(str
);
9326 STR_SET_LEN(str
, len
);
9327 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
9328 if (ENC_CODERANGE(str
) != ENC_CODERANGE_7BIT
) {
9329 ENC_CODERANGE_CLEAR(str
);
9339 * str.chop -> new_str
9341 * Returns a new String with the last character removed. If the
9342 * string ends with <code>\r\n</code>, both characters are
9343 * removed. Applying <code>chop</code> to an empty string returns an
9344 * empty string. String#chomp is often a safer alternative, as it
9345 * leaves the string unchanged if it doesn't end in a record
9348 * "string\r\n".chop #=> "string"
9349 * "string\n\r".chop #=> "string\n"
9350 * "string\n".chop #=> "string"
9351 * "string".chop #=> "strin"
9352 * "x".chop.chop #=> ""
9356 rb_str_chop(VALUE str
)
9358 return rb_str_subseq(str
, 0, chopped_length(str
));
9362 smart_chomp(VALUE str
, const char *e
, const char *p
)
9364 rb_encoding
*enc
= rb_enc_get(str
);
9365 if (rb_enc_mbminlen(enc
) > 1) {
9366 const char *pp
= rb_enc_left_char_head(p
, e
-rb_enc_mbminlen(enc
), e
, enc
);
9367 if (rb_enc_is_newline(pp
, e
, enc
)) {
9370 pp
= e
- rb_enc_mbminlen(enc
);
9372 pp
= rb_enc_left_char_head(p
, pp
, e
, enc
);
9373 if (rb_enc_ascget(pp
, e
, 0, enc
) == '\r') {
9379 switch (*(e
-1)) { /* not e[-1] to get rid of VC bug */
9381 if (--e
> p
&& *(e
-1) == '\r') {
9394 chompped_length(VALUE str
, VALUE rs
)
9398 char *pp
, *e
, *rsptr
;
9400 char *const p
= RSTRING_PTR(str
);
9401 long len
= RSTRING_LEN(str
);
9403 if (len
== 0) return 0;
9405 if (rs
== rb_default_rs
) {
9406 return smart_chomp(str
, e
, p
);
9409 enc
= rb_enc_get(str
);
9410 RSTRING_GETMEM(rs
, rsptr
, rslen
);
9412 if (rb_enc_mbminlen(enc
) > 1) {
9414 pp
= rb_enc_left_char_head(p
, e
-rb_enc_mbminlen(enc
), e
, enc
);
9415 if (!rb_enc_is_newline(pp
, e
, enc
)) break;
9417 pp
-= rb_enc_mbminlen(enc
);
9419 pp
= rb_enc_left_char_head(p
, pp
, e
, enc
);
9420 if (rb_enc_ascget(pp
, e
, 0, enc
) == '\r') {
9427 while (e
> p
&& *(e
-1) == '\n') {
9429 if (e
> p
&& *(e
-1) == '\r')
9435 if (rslen
> len
) return len
;
9437 enc
= rb_enc_get(rs
);
9438 newline
= rsptr
[rslen
-1];
9439 if (rslen
== rb_enc_mbminlen(enc
)) {
9441 if (newline
== '\n')
9442 return smart_chomp(str
, e
, p
);
9445 if (rb_enc_is_newline(rsptr
, rsptr
+rslen
, enc
))
9446 return smart_chomp(str
, e
, p
);
9450 enc
= rb_enc_check(str
, rs
);
9451 if (is_broken_string(rs
)) {
9455 if (p
[len
-1] == newline
&&
9457 memcmp(rsptr
, pp
, rslen
) == 0)) {
9458 if (rb_enc_left_char_head(p
, pp
, e
, enc
) == pp
)
9466 * Returns the separator for arguments of rb_str_chomp.
9468 * @return rb_rs ($/) by default; the default value of rb_rs ($/) is "\n".
9471 chomp_rs(int argc
, const VALUE
*argv
)
9473 rb_check_arity(argc
, 0, 1);
9476 if (!NIL_P(rs
)) StringValue(rs
);
9485 rb_str_chomp_string(VALUE str
, VALUE rs
)
9487 long olen
= RSTRING_LEN(str
);
9488 long len
= chompped_length(str
, rs
);
9489 if (len
>= olen
) return Qnil
;
9490 str_modify_keep_cr(str
);
9491 STR_SET_LEN(str
, len
);
9492 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
9493 if (ENC_CODERANGE(str
) != ENC_CODERANGE_7BIT
) {
9494 ENC_CODERANGE_CLEAR(str
);
9501 * str.chomp!(separator=$/) -> str or nil
9503 * Modifies <i>str</i> in place as described for String#chomp,
9504 * returning <i>str</i>, or <code>nil</code> if no modifications were
9509 rb_str_chomp_bang(int argc
, VALUE
*argv
, VALUE str
)
9512 str_modifiable(str
);
9513 if (RSTRING_LEN(str
) == 0) return Qnil
;
9514 rs
= chomp_rs(argc
, argv
);
9515 if (NIL_P(rs
)) return Qnil
;
9516 return rb_str_chomp_string(str
, rs
);
9522 * str.chomp(separator=$/) -> new_str
9524 * Returns a new String with the given record separator removed
9525 * from the end of <i>str</i> (if present). If <code>$/</code> has not been
9526 * changed from the default Ruby record separator, then <code>chomp</code> also
9527 * removes carriage return characters (that is, it will remove <code>\n</code>,
9528 * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9529 * it will remove all trailing newlines from the string.
9531 * "hello".chomp #=> "hello"
9532 * "hello\n".chomp #=> "hello"
9533 * "hello\r\n".chomp #=> "hello"
9534 * "hello\n\r".chomp #=> "hello\n"
9535 * "hello\r".chomp #=> "hello"
9536 * "hello \n there".chomp #=> "hello \n there"
9537 * "hello".chomp("llo") #=> "he"
9538 * "hello\r\n\r\n".chomp('') #=> "hello"
9539 * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
9543 rb_str_chomp(int argc
, VALUE
*argv
, VALUE str
)
9545 VALUE rs
= chomp_rs(argc
, argv
);
9546 if (NIL_P(rs
)) return str_duplicate(rb_cString
, str
);
9547 return rb_str_subseq(str
, 0, chompped_length(str
, rs
));
9551 lstrip_offset(VALUE str
, const char *s
, const char *e
, rb_encoding
*enc
)
9553 const char *const start
= s
;
9555 if (!s
|| s
>= e
) return 0;
9557 /* remove spaces at head */
9558 if (single_byte_optimizable(str
)) {
9559 while (s
< e
&& (*s
== '\0' || ascii_isspace(*s
))) s
++;
9564 unsigned int cc
= rb_enc_codepoint_len(s
, e
, &n
, enc
);
9566 if (cc
&& !rb_isspace(cc
)) break;
9575 * str.lstrip! -> self or nil
9577 * Removes leading whitespace from the receiver.
9578 * Returns the altered receiver, or +nil+ if no change was made.
9579 * See also String#rstrip! and String#strip!.
9581 * Refer to String#strip for the definition of whitespace.
9583 * " hello ".lstrip! #=> "hello "
9584 * "hello ".lstrip! #=> nil
9585 * "hello".lstrip! #=> nil
9589 rb_str_lstrip_bang(VALUE str
)
9595 str_modify_keep_cr(str
);
9596 enc
= STR_ENC_GET(str
);
9597 RSTRING_GETMEM(str
, start
, olen
);
9598 loffset
= lstrip_offset(str
, start
, start
+olen
, enc
);
9600 long len
= olen
-loffset
;
9601 s
= start
+ loffset
;
9602 memmove(start
, s
, len
);
9603 STR_SET_LEN(str
, len
);
9604 TERM_FILL(start
+len
, rb_enc_mbminlen(enc
));
9613 * str.lstrip -> new_str
9615 * Returns a copy of the receiver with leading whitespace removed.
9616 * See also String#rstrip and String#strip.
9618 * Refer to String#strip for the definition of whitespace.
9620 * " hello ".lstrip #=> "hello "
9621 * "hello".lstrip #=> "hello"
9625 rb_str_lstrip(VALUE str
)
9629 RSTRING_GETMEM(str
, start
, len
);
9630 loffset
= lstrip_offset(str
, start
, start
+len
, STR_ENC_GET(str
));
9631 if (loffset
<= 0) return str_duplicate(rb_cString
, str
);
9632 return rb_str_subseq(str
, loffset
, len
- loffset
);
9636 rstrip_offset(VALUE str
, const char *s
, const char *e
, rb_encoding
*enc
)
9640 rb_str_check_dummy_enc(enc
);
9641 if (!s
|| s
>= e
) return 0;
9644 /* remove trailing spaces or '\0's */
9645 if (single_byte_optimizable(str
)) {
9647 while (s
< t
&& ((c
= *(t
-1)) == '\0' || ascii_isspace(c
))) t
--;
9652 while ((tp
= rb_enc_prev_char(s
, t
, e
, enc
)) != NULL
) {
9653 unsigned int c
= rb_enc_codepoint(tp
, e
, enc
);
9654 if (c
&& !rb_isspace(c
)) break;
9663 * str.rstrip! -> self or nil
9665 * Removes trailing whitespace from the receiver.
9666 * Returns the altered receiver, or +nil+ if no change was made.
9667 * See also String#lstrip! and String#strip!.
9669 * Refer to String#strip for the definition of whitespace.
9671 * " hello ".rstrip! #=> " hello"
9672 * " hello".rstrip! #=> nil
9673 * "hello".rstrip! #=> nil
9677 rb_str_rstrip_bang(VALUE str
)
9683 str_modify_keep_cr(str
);
9684 enc
= STR_ENC_GET(str
);
9685 RSTRING_GETMEM(str
, start
, olen
);
9686 roffset
= rstrip_offset(str
, start
, start
+olen
, enc
);
9688 long len
= olen
- roffset
;
9690 STR_SET_LEN(str
, len
);
9691 TERM_FILL(start
+len
, rb_enc_mbminlen(enc
));
9700 * str.rstrip -> new_str
9702 * Returns a copy of the receiver with trailing whitespace removed.
9703 * See also String#lstrip and String#strip.
9705 * Refer to String#strip for the definition of whitespace.
9707 * " hello ".rstrip #=> " hello"
9708 * "hello".rstrip #=> "hello"
9712 rb_str_rstrip(VALUE str
)
9718 enc
= STR_ENC_GET(str
);
9719 RSTRING_GETMEM(str
, start
, olen
);
9720 roffset
= rstrip_offset(str
, start
, start
+olen
, enc
);
9722 if (roffset
<= 0) return str_duplicate(rb_cString
, str
);
9723 return rb_str_subseq(str
, 0, olen
-roffset
);
9729 * str.strip! -> self or nil
9731 * Removes leading and trailing whitespace from the receiver.
9732 * Returns the altered receiver, or +nil+ if there was no change.
9734 * Refer to String#strip for the definition of whitespace.
9736 * " hello ".strip! #=> "hello"
9737 * "hello".strip! #=> nil
9741 rb_str_strip_bang(VALUE str
)
9744 long olen
, loffset
, roffset
;
9747 str_modify_keep_cr(str
);
9748 enc
= STR_ENC_GET(str
);
9749 RSTRING_GETMEM(str
, start
, olen
);
9750 loffset
= lstrip_offset(str
, start
, start
+olen
, enc
);
9751 roffset
= rstrip_offset(str
, start
+loffset
, start
+olen
, enc
);
9753 if (loffset
> 0 || roffset
> 0) {
9754 long len
= olen
-roffset
;
9757 memmove(start
, start
+ loffset
, len
);
9759 STR_SET_LEN(str
, len
);
9760 TERM_FILL(start
+len
, rb_enc_mbminlen(enc
));
9769 * str.strip -> new_str
9771 * Returns a copy of the receiver with leading and trailing whitespace removed.
9773 * Whitespace is defined as any of the following characters:
9774 * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9776 * " hello ".strip #=> "hello"
9777 * "\tgoodbye\r\n".strip #=> "goodbye"
9778 * "\x00\t\n\v\f\r ".strip #=> ""
9779 * "hello".strip #=> "hello"
9783 rb_str_strip(VALUE str
)
9786 long olen
, loffset
, roffset
;
9787 rb_encoding
*enc
= STR_ENC_GET(str
);
9789 RSTRING_GETMEM(str
, start
, olen
);
9790 loffset
= lstrip_offset(str
, start
, start
+olen
, enc
);
9791 roffset
= rstrip_offset(str
, start
+loffset
, start
+olen
, enc
);
9793 if (loffset
<= 0 && roffset
<= 0) return str_duplicate(rb_cString
, str
);
9794 return rb_str_subseq(str
, loffset
, olen
-loffset
-roffset
);
9798 scan_once(VALUE str
, VALUE pat
, long *start
, int set_backref_str
)
9800 VALUE result
, match
;
9801 struct re_registers
*regs
;
9803 long end
, pos
= rb_pat_search(pat
, str
, *start
, set_backref_str
);
9805 if (BUILTIN_TYPE(pat
) == T_STRING
) {
9807 end
= pos
+ RSTRING_LEN(pat
);
9810 match
= rb_backref_get();
9811 regs
= RMATCH_REGS(match
);
9816 rb_encoding
*enc
= STR_ENC_GET(str
);
9818 * Always consume at least one character of the input string
9820 if (RSTRING_LEN(str
) > end
)
9821 *start
= end
+ rb_enc_fast_mbclen(RSTRING_PTR(str
) + end
,
9822 RSTRING_END(str
), enc
);
9829 if (!regs
|| regs
->num_regs
== 1) {
9830 result
= rb_str_subseq(str
, pos
, end
- pos
);
9833 result
= rb_ary_new2(regs
->num_regs
);
9834 for (i
=1; i
< regs
->num_regs
; i
++) {
9837 s
= rb_str_subseq(str
, BEG(i
), END(i
)-BEG(i
));
9839 rb_ary_push(result
, s
);
9850 * str.scan(pattern) -> array
9851 * str.scan(pattern) {|match, ...| block } -> str
9853 * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9854 * Regexp or a String). For each match, a result is
9855 * generated and either added to the result array or passed to the block. If
9856 * the pattern contains no groups, each individual result consists of the
9857 * matched string, <code>$&</code>. If the pattern contains groups, each
9858 * individual result is itself an array containing one entry per group.
9861 * a.scan(/\w+/) #=> ["cruel", "world"]
9862 * a.scan(/.../) #=> ["cru", "el ", "wor"]
9863 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9864 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9866 * And the block form:
9868 * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9870 * a.scan(/(.)(.)/) {|x,y| print y, x }
9873 * <em>produces:</em>
9875 * <<cruel>> <<world>>
9880 rb_str_scan(VALUE str
, VALUE pat
)
9884 long last
= -1, prev
= 0;
9885 char *p
= RSTRING_PTR(str
); long len
= RSTRING_LEN(str
);
9887 pat
= get_pat_quoted(pat
, 1);
9888 mustnot_broken(str
);
9889 if (!rb_block_given_p()) {
9890 VALUE ary
= rb_ary_new();
9892 while (!NIL_P(result
= scan_once(str
, pat
, &start
, 0))) {
9895 rb_ary_push(ary
, result
);
9897 if (last
>= 0) rb_pat_search(pat
, str
, last
, 1);
9898 else rb_backref_set(Qnil
);
9902 while (!NIL_P(result
= scan_once(str
, pat
, &start
, 1))) {
9906 str_mod_check(str
, p
, len
);
9908 if (last
>= 0) rb_pat_search(pat
, str
, last
, 1);
9915 * str.hex -> integer
9917 * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9918 * (with an optional sign and an optional <code>0x</code>) and returns the
9919 * corresponding number. Zero is returned on error.
9922 * "-1234".hex #=> -4660
9924 * "wombat".hex #=> 0
9928 rb_str_hex(VALUE str
)
9930 return rb_str_to_inum(str
, 16, FALSE
);
9936 * str.oct -> integer
9938 * Treats leading characters of <i>str</i> as a string of octal digits (with an
9939 * optional sign) and returns the corresponding number. Returns 0 if the
9943 * "-377".oct #=> -255
9945 * "0377bad".oct #=> 255
9947 * If +str+ starts with <code>0</code>, radix indicators are honored.
9948 * See Kernel#Integer.
9952 rb_str_oct(VALUE str
)
9954 return rb_str_to_inum(str
, -8, FALSE
);
9957 #ifndef HAVE_CRYPT_R
9958 # include "ruby/thread_native.h"
9959 # include "ruby/atomic.h"
9962 rb_atomic_t initialized
;
9963 rb_nativethread_lock_t lock
;
9967 crypt_mutex_destroy(void)
9969 RUBY_ASSERT_ALWAYS(crypt_mutex
.initialized
== 1);
9970 rb_nativethread_lock_destroy(&crypt_mutex
.lock
);
9971 crypt_mutex
.initialized
= 0;
9975 crypt_mutex_initialize(void)
9978 while ((i
= RUBY_ATOMIC_CAS(crypt_mutex
.initialized
, 0, 2)) == 2);
9981 rb_nativethread_lock_initialize(&crypt_mutex
.lock
);
9982 atexit(crypt_mutex_destroy
);
9983 RUBY_ASSERT(crypt_mutex
.initialized
== 2);
9984 RUBY_ATOMIC_CAS(crypt_mutex
.initialized
, 2, 1);
9989 rb_bug("crypt_mutex.initialized: %d->%d", i
, crypt_mutex
.initialized
);
9996 * str.crypt(salt_str) -> new_str
9998 * Returns the string generated by calling <code>crypt(3)</code>
9999 * standard library function with <code>str</code> and
10000 * <code>salt_str</code>, in this order, as its arguments. Please do
10001 * not use this method any longer. It is legacy; provided only for
10002 * backward compatibility with ruby scripts in earlier days. It is
10003 * bad to use in contemporary programs for several reasons:
10005 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10006 * run on. The generated string lacks data portability.
10008 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10009 * (i.e. silently ends up in unexpected results).
10011 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10014 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10015 * very very weak. According to its manpage, Linux's traditional
10016 * <code>crypt(3)</code> output has only 2**56 variations; too
10017 * easy to brute force today. And this is the default behaviour.
10019 * * In order to make things robust some OSes implement so-called
10020 * "modular" usage. To go through, you have to do a complex
10021 * build-up of the <code>salt_str</code> parameter, by hand.
10022 * Failure in generation of a proper salt string tends not to
10023 * yield any errors; typos in parameters are normally not
10026 * * For instance, in the following example, the second invocation
10027 * of String#crypt is wrong; it has a typo in "round=" (lacks
10028 * "s"). However the call does not fail and something unexpected
10031 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10032 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10034 * * Even in the "modular" mode, some hash functions are considered
10035 * archaic and no longer recommended at all; for instance module
10036 * <code>$1$</code> is officially abandoned by its author: see
10037 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10038 * instance module <code>$3$</code> is considered completely
10039 * broken: see the manpage of FreeBSD.
10041 * * On some OSes such as Mac OS, there is no modular mode. Yet, as
10042 * written above, <code>crypt(3)</code> on Mac OS never fails.
10043 * This means even if you build up a proper salt string it
10044 * generates a traditional DES hash anyways, and there is no way
10045 * for you to be aware of it.
10047 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10049 * If for some reason you cannot migrate to other secure contemporary
10050 * password hashing algorithms, install the string-crypt gem and
10051 * <code>require 'string/crypt'</code> to continue using it.
10055 rb_str_crypt(VALUE str
, VALUE salt
)
10057 #ifdef HAVE_CRYPT_R
10059 struct crypt_data
*data
;
10060 # define CRYPT_END() ALLOCV_END(databuf)
10062 extern char *crypt(const char *, const char *);
10063 # define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10066 const char *s
, *saltp
;
10068 #ifdef BROKEN_CRYPT
10069 char salt_8bit_clean
[3];
10073 mustnot_wchar(str
);
10074 mustnot_wchar(salt
);
10075 s
= StringValueCStr(str
);
10076 saltp
= RSTRING_PTR(salt
);
10077 if (RSTRING_LEN(salt
) < 2 || !saltp
[0] || !saltp
[1]) {
10078 rb_raise(rb_eArgError
, "salt too short (need >=2 bytes)");
10081 #ifdef BROKEN_CRYPT
10082 if (!ISASCII((unsigned char)saltp
[0]) || !ISASCII((unsigned char)saltp
[1])) {
10083 salt_8bit_clean
[0] = saltp
[0] & 0x7f;
10084 salt_8bit_clean
[1] = saltp
[1] & 0x7f;
10085 salt_8bit_clean
[2] = '\0';
10086 saltp
= salt_8bit_clean
;
10089 #ifdef HAVE_CRYPT_R
10090 data
= ALLOCV(databuf
, sizeof(struct crypt_data
));
10091 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10092 data
->initialized
= 0;
10094 res
= crypt_r(s
, saltp
, data
);
10096 crypt_mutex_initialize();
10097 rb_nativethread_lock_lock(&crypt_mutex
.lock
);
10098 res
= crypt(s
, saltp
);
10103 rb_syserr_fail(err
, "crypt");
10105 result
= rb_str_new_cstr(res
);
10113 * str.ord -> integer
10115 * Returns the Integer ordinal of a one-character string.
10121 rb_str_ord(VALUE s
)
10125 c
= rb_enc_codepoint(RSTRING_PTR(s
), RSTRING_END(s
), STR_ENC_GET(s
));
10126 return UINT2NUM(c
);
10130 * str.sum(n=16) -> integer
10132 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
10133 * where <em>n</em> is the optional Integer parameter, defaulting
10134 * to 16. The result is simply the sum of the binary value of each byte in
10135 * <i>str</i> modulo <code>2**n</code> (the sum is masked with <code>2**n - 1</code>). This is not a particularly good
10140 rb_str_sum(int argc
, VALUE
*argv
, VALUE str
)
10143 char *ptr
, *p
, *pend
;
10145 VALUE sum
= INT2FIX(0);
10146 unsigned long sum0
= 0;
10148 if (rb_check_arity(argc
, 0, 1) && (bits
= NUM2INT(argv
[0])) < 0) {
10151 ptr
= p
= RSTRING_PTR(str
);
10152 len
= RSTRING_LEN(str
);
10156 if (FIXNUM_MAX
- UCHAR_MAX
< sum0
) {
10157 sum
= rb_funcall(sum
, '+', 1, LONG2FIX(sum0
));
10158 str_mod_check(str
, ptr
, len
);
10161 sum0
+= (unsigned char)*p
;
10167 sum
= rb_funcall(sum
, '+', 1, LONG2FIX(sum0
));
10171 if (sum
== INT2FIX(0)) {
10172 if (bits
< (int)sizeof(long)*CHAR_BIT
) {
10173 sum0
&= (((unsigned long)1)<<bits
)-1;
10175 sum
= LONG2FIX(sum0
);
10181 sum
= rb_funcall(sum
, '+', 1, LONG2FIX(sum0
));
10184 mod
= rb_funcall(INT2FIX(1), idLTLT
, 1, INT2FIX(bits
));
10185 mod
= rb_funcall(mod
, '-', 1, INT2FIX(1));
10186 sum
= rb_funcall(sum
, '&', 1, mod
);
10193 rb_str_justify(int argc
, VALUE
*argv
, VALUE str
, char jflag
)
10197 long width
, len
, flen
= 1, fclen
= 1;
10200 const char *f
= " ";
10201 long n
, size
, llen
, rlen
, llen2
= 0, rlen2
= 0;
10203 int singlebyte
= 1, cr
;
10206 rb_scan_args(argc
, argv
, "11", &w
, &pad
);
10207 enc
= STR_ENC_GET(str
);
10208 termlen
= rb_enc_mbminlen(enc
);
10209 width
= NUM2LONG(w
);
10212 enc
= rb_enc_check(str
, pad
);
10213 f
= RSTRING_PTR(pad
);
10214 flen
= RSTRING_LEN(pad
);
10215 fclen
= str_strlen(pad
, enc
); /* rb_enc_check */
10216 singlebyte
= single_byte_optimizable(pad
);
10217 if (flen
== 0 || fclen
== 0) {
10218 rb_raise(rb_eArgError
, "zero width padding");
10221 len
= str_strlen(str
, enc
); /* rb_enc_check */
10222 if (width
< 0 || len
>= width
) return str_duplicate(rb_cString
, str
);
10224 llen
= (jflag
== 'l') ? 0 : ((jflag
== 'r') ? n
: n
/2);
10226 cr
= ENC_CODERANGE(str
);
10228 llen2
= str_offset(f
, f
+ flen
, llen
% fclen
, enc
, singlebyte
);
10229 rlen2
= str_offset(f
, f
+ flen
, rlen
% fclen
, enc
, singlebyte
);
10231 size
= RSTRING_LEN(str
);
10232 if ((len
= llen
/ fclen
+ rlen
/ fclen
) >= LONG_MAX
/ flen
||
10233 (len
*= flen
) >= LONG_MAX
- llen2
- rlen2
||
10234 (len
+= llen2
+ rlen2
) >= LONG_MAX
- size
) {
10235 rb_raise(rb_eArgError
, "argument too big");
10238 res
= str_new0(rb_cString
, 0, len
, termlen
);
10239 p
= RSTRING_PTR(res
);
10241 memset(p
, *f
, llen
);
10245 while (llen
>= fclen
) {
10251 memcpy(p
, f
, llen2
);
10255 memcpy(p
, RSTRING_PTR(str
), size
);
10258 memset(p
, *f
, rlen
);
10262 while (rlen
>= fclen
) {
10268 memcpy(p
, f
, rlen2
);
10272 TERM_FILL(p
, termlen
);
10273 STR_SET_LEN(res
, p
-RSTRING_PTR(res
));
10274 rb_enc_associate(res
, enc
);
10276 cr
= ENC_CODERANGE_AND(cr
, ENC_CODERANGE(pad
));
10277 if (cr
!= ENC_CODERANGE_BROKEN
)
10278 ENC_CODERANGE_SET(res
, cr
);
10287 * str.ljust(integer, padstr=' ') -> new_str
10289 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10290 * String of length <i>integer</i> with <i>str</i> left justified
10291 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10293 * "hello".ljust(4) #=> "hello"
10294 * "hello".ljust(20) #=> "hello "
10295 * "hello".ljust(20, '1234') #=> "hello123412341234123"
10299 rb_str_ljust(int argc
, VALUE
*argv
, VALUE str
)
10301 return rb_str_justify(argc
, argv
, str
, 'l');
10307 * str.rjust(integer, padstr=' ') -> new_str
10309 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10310 * String of length <i>integer</i> with <i>str</i> right justified
10311 * and padded with <i>padstr</i>; otherwise, returns a copy of <i>str</i>.
10313 * "hello".rjust(4) #=> "hello"
10314 * "hello".rjust(20) #=> "               hello"
10315 * "hello".rjust(20, '1234') #=> "123412341234123hello"
10319 rb_str_rjust(int argc
, VALUE
*argv
, VALUE str
)
10321 return rb_str_justify(argc
, argv
, str
, 'r');
10327 * str.center(width, padstr=' ') -> new_str
10329 * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
10330 * returns a new String of length +width+ with +str+ centered and padded with
10331 * +padstr+; otherwise, returns a copy of +str+.
10333 * "hello".center(4) #=> "hello"
10334 * "hello".center(20) #=> "       hello        "
10335 * "hello".center(20, '123') #=> "1231231hello12312312"
10339 rb_str_center(int argc
, VALUE
*argv
, VALUE str
)
10341 return rb_str_justify(argc
, argv
, str
, 'c');
10346 * str.partition(sep) -> [head, sep, tail]
10347 * str.partition(regexp) -> [head, match, tail]
10349 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10350 * and returns the part before it, the match, and the part after it.
10352 * If it is not found, returns <i>str</i> and two empty strings.
10354 * "hello".partition("l") #=> ["he", "l", "lo"]
10355 * "hello".partition("x") #=> ["hello", "", ""]
10356 * "hello".partition(/.l/) #=> ["h", "el", "lo"]
10360 rb_str_partition(VALUE str
, VALUE sep
)
10364 sep
= get_pat_quoted(sep
, 0);
10365 if (RB_TYPE_P(sep
, T_REGEXP
)) {
10366 if (rb_reg_search(sep
, str
, 0, 0) < 0) {
10369 VALUE match
= rb_backref_get();
10370 struct re_registers
*regs
= RMATCH_REGS(match
);
10373 sep
= rb_str_subseq(str
, pos
, END(0) - pos
);
10376 pos
= rb_str_index(str
, sep
, 0);
10377 if (pos
< 0) goto failed
;
10379 return rb_ary_new3(3, rb_str_subseq(str
, 0, pos
),
10381 rb_str_subseq(str
, pos
+RSTRING_LEN(sep
),
10382 RSTRING_LEN(str
)-pos
-RSTRING_LEN(sep
)));
10385 return rb_ary_new3(3, str_duplicate(rb_cString
, str
), str_new_empty_String(str
), str_new_empty_String(str
));
10390 * str.rpartition(sep) -> [head, sep, tail]
10391 * str.rpartition(regexp) -> [head, match, tail]
10393 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10394 * of the string, and returns the part before it, the match, and the part after it.
10396 * If it is not found, returns two empty strings and <i>str</i>.
10398 * "hello".rpartition("l") #=> ["hel", "l", "o"]
10399 * "hello".rpartition("x") #=> ["", "", "hello"]
10400 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
10402 * The match from the end means starting at the possible last position, not
10403 * the last of longest matches.
10405 * "hello".rpartition(/l+/) #=> ["hel", "l", "o"]
10407 * To partition at the last longest match, combine the pattern with a
10408 * negative lookbehind.
10410 * "hello".rpartition(/(?<!l)l+/) #=> ["he", "ll", "o"]
10412 * Or use String#partition with a negative lookahead.
10414 * "hello".partition(/l+(?!.*l)/) #=> ["he", "ll", "o"]
10418 rb_str_rpartition(VALUE str
, VALUE sep
)
10420 long pos
= RSTRING_LEN(str
);
10422 sep
= get_pat_quoted(sep
, 0);
10423 if (RB_TYPE_P(sep
, T_REGEXP
)) {
10424 if (rb_reg_search(sep
, str
, pos
, 1) < 0) {
10427 VALUE match
= rb_backref_get();
10428 struct re_registers
*regs
= RMATCH_REGS(match
);
10431 sep
= rb_str_subseq(str
, pos
, END(0) - pos
);
10434 pos
= rb_str_sublen(str
, pos
);
10435 pos
= rb_str_rindex(str
, sep
, pos
);
10439 pos
= rb_str_offset(str
, pos
);
10442 return rb_ary_new3(3, rb_str_subseq(str
, 0, pos
),
10444 rb_str_subseq(str
, pos
+RSTRING_LEN(sep
),
10445 RSTRING_LEN(str
)-pos
-RSTRING_LEN(sep
)));
10447 return rb_ary_new3(3, str_new_empty_String(str
), str_new_empty_String(str
), str_duplicate(rb_cString
, str
));
10452 * str.start_with?([prefixes]+) -> true or false
10454 * Returns true if +str+ starts with one of the +prefixes+ given.
10455 * Each of the +prefixes+ should be a String or a Regexp.
10457 * "hello".start_with?("hell") #=> true
10458 * "hello".start_with?(/H/i) #=> true
10460 * # returns true if one of the prefixes matches.
10461 * "hello".start_with?("heaven", "hell") #=> true
10462 * "hello".start_with?("heaven", "paradise") #=> false
10466 rb_str_start_with(int argc
, VALUE
*argv
, VALUE str
)
10470 for (i
=0; i
<argc
; i
++) {
10471 VALUE tmp
= argv
[i
];
10472 if (RB_TYPE_P(tmp
, T_REGEXP
)) {
10473 if (rb_reg_start_with_p(tmp
, str
))
10478 rb_enc_check(str
, tmp
);
10479 if (RSTRING_LEN(str
) < RSTRING_LEN(tmp
)) continue;
10480 if (memcmp(RSTRING_PTR(str
), RSTRING_PTR(tmp
), RSTRING_LEN(tmp
)) == 0)
10489 * str.end_with?([suffixes]+) -> true or false
10491 * Returns true if +str+ ends with one of the +suffixes+ given.
10493 * "hello".end_with?("ello") #=> true
10495 * # returns true if one of the +suffixes+ matches.
10496 * "hello".end_with?("heaven", "ello") #=> true
10497 * "hello".end_with?("heaven", "paradise") #=> false
10501 rb_str_end_with(int argc
, VALUE
*argv
, VALUE str
)
10507 for (i
=0; i
<argc
; i
++) {
10508 VALUE tmp
= argv
[i
];
10511 enc
= rb_enc_check(str
, tmp
);
10512 if ((tlen
= RSTRING_LEN(tmp
)) == 0) return Qtrue
;
10513 if ((slen
= RSTRING_LEN(str
)) < tlen
) continue;
10514 p
= RSTRING_PTR(str
);
10517 if (rb_enc_left_char_head(p
, s
, e
, enc
) != s
)
10519 if (memcmp(s
, RSTRING_PTR(tmp
), RSTRING_LEN(tmp
)) == 0)
10526 * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
10527 * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
10529 * @param str the target
10530 * @param prefix the prefix
10531 * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
10532 * @retval Positive-Integer otherwise
10535 deleted_prefix_length(VALUE str
, VALUE prefix
)
10537 char *strptr
, *prefixptr
;
10538 long olen
, prefixlen
;
10540 StringValue(prefix
);
10541 if (is_broken_string(prefix
)) return 0;
10542 rb_enc_check(str
, prefix
);
10544 /* return 0 if str does not start with prefix */
10545 prefixlen
= RSTRING_LEN(prefix
);
10546 if (prefixlen
<= 0) return 0;
10547 olen
= RSTRING_LEN(str
);
10548 if (olen
< prefixlen
) return 0;
10549 strptr
= RSTRING_PTR(str
);
10550 prefixptr
= RSTRING_PTR(prefix
);
10551 if (memcmp(strptr
, prefixptr
, prefixlen
) != 0) return 0;
10558 * str.delete_prefix!(prefix) -> self or nil
10560 * Deletes leading <code>prefix</code> from <i>str</i>, returning
10561 * <code>nil</code> if no change was made.
10563 * "hello".delete_prefix!("hel") #=> "lo"
10564 * "hello".delete_prefix!("llo") #=> nil
10568 rb_str_delete_prefix_bang(VALUE str
, VALUE prefix
)
10571 str_modify_keep_cr(str
);
10573 prefixlen
= deleted_prefix_length(str
, prefix
);
10574 if (prefixlen
<= 0) return Qnil
;
10576 return rb_str_drop_bytes(str
, prefixlen
);
10581 * str.delete_prefix(prefix) -> new_str
10583 * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10585 * "hello".delete_prefix("hel") #=> "lo"
10586 * "hello".delete_prefix("llo") #=> "hello"
10590 rb_str_delete_prefix(VALUE str
, VALUE prefix
)
10594 prefixlen
= deleted_prefix_length(str
, prefix
);
10595 if (prefixlen
<= 0) return str_duplicate(rb_cString
, str
);
10597 return rb_str_subseq(str
, prefixlen
, RSTRING_LEN(str
) - prefixlen
);
10601 * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
10602 * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
10604 * @param str the target
10605 * @param suffix the suffix
10606 * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
10607 * @retval Positive-Integer otherwise
10610 deleted_suffix_length(VALUE str
, VALUE suffix
)
10612 char *strptr
, *suffixptr
, *s
;
10613 long olen
, suffixlen
;
10616 StringValue(suffix
);
10617 if (is_broken_string(suffix
)) return 0;
10618 enc
= rb_enc_check(str
, suffix
);
10620 /* return 0 if str does not end with suffix */
10621 suffixlen
= RSTRING_LEN(suffix
);
10622 if (suffixlen
<= 0) return 0;
10623 olen
= RSTRING_LEN(str
);
10624 if (olen
< suffixlen
) return 0;
10625 strptr
= RSTRING_PTR(str
);
10626 suffixptr
= RSTRING_PTR(suffix
);
10627 s
= strptr
+ olen
- suffixlen
;
10628 if (memcmp(s
, suffixptr
, suffixlen
) != 0) return 0;
10629 if (rb_enc_left_char_head(strptr
, s
, strptr
+ olen
, enc
) != s
) return 0;
10636 * str.delete_suffix!(suffix) -> self or nil
10638 * Deletes trailing <code>suffix</code> from <i>str</i>, returning
10639 * <code>nil</code> if no change was made.
10641 * "hello".delete_suffix!("llo") #=> "he"
10642 * "hello".delete_suffix!("hel") #=> nil
10646 rb_str_delete_suffix_bang(VALUE str
, VALUE suffix
)
10648 long olen
, suffixlen
, len
;
10649 str_modifiable(str
);
10651 suffixlen
= deleted_suffix_length(str
, suffix
);
10652 if (suffixlen
<= 0) return Qnil
;
10654 olen
= RSTRING_LEN(str
);
10655 str_modify_keep_cr(str
);
10656 len
= olen
- suffixlen
;
10657 STR_SET_LEN(str
, len
);
10658 TERM_FILL(&RSTRING_PTR(str
)[len
], TERM_LEN(str
));
10659 if (ENC_CODERANGE(str
) != ENC_CODERANGE_7BIT
) {
10660 ENC_CODERANGE_CLEAR(str
);
10667 * str.delete_suffix(suffix) -> new_str
10669 * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10671 * "hello".delete_suffix("llo") #=> "he"
10672 * "hello".delete_suffix("hel") #=> "hello"
10676 rb_str_delete_suffix(VALUE str
, VALUE suffix
)
10680 suffixlen
= deleted_suffix_length(str
, suffix
);
10681 if (suffixlen
<= 0) return str_duplicate(rb_cString
, str
);
10683 return rb_str_subseq(str
, 0, RSTRING_LEN(str
) - suffixlen
);
10687 rb_str_setter(VALUE val
, ID id
, VALUE
*var
)
10689 if (!NIL_P(val
) && !RB_TYPE_P(val
, T_STRING
)) {
10690 rb_raise(rb_eTypeError
, "value of %"PRIsVALUE
" must be String", rb_id2str(id
));
10696 rb_fs_setter(VALUE val
, ID id
, VALUE
*var
)
10698 val
= rb_fs_check(val
);
10700 rb_raise(rb_eTypeError
,
10701 "value of %"PRIsVALUE
" must be String or Regexp",
10705 rb_warn_deprecated("`$;'", NULL
);
10713 * str.force_encoding(encoding) -> str
10715 * Changes the encoding to +encoding+ and returns self.
10719 rb_str_force_encoding(VALUE str
, VALUE enc
)
10721 str_modifiable(str
);
10722 rb_enc_associate(str
, rb_to_encoding(enc
));
10723 ENC_CODERANGE_CLEAR(str
);
10731 * Returns a copied string whose encoding is ASCII-8BIT.
10735 rb_str_b(VALUE str
)
10738 if (FL_TEST(str
, STR_NOEMBED
)) {
10739 str2
= str_alloc_heap(rb_cString
);
10742 str2
= str_alloc_embed(rb_cString
, RSTRING_EMBED_LEN(str
) + TERM_LEN(str
));
10744 str_replace_shared_without_enc(str2
, str
);
10745 ENC_CODERANGE_CLEAR(str2
);
10751 * str.valid_encoding? -> true or false
10753 * Returns true for a string which is encoded correctly.
10755 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10756 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10757 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
10761 rb_str_valid_encoding_p(VALUE str
)
10763 int cr
= rb_enc_str_coderange(str
);
10765 return RBOOL(cr
!= ENC_CODERANGE_BROKEN
);
10770 * str.ascii_only? -> true or false
10772 * Returns true for a string which has only ASCII characters.
10774 * "abc".force_encoding("UTF-8").ascii_only? #=> true
10775 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
10779 rb_str_is_ascii_only_p(VALUE str
)
10781 int cr
= rb_enc_str_coderange(str
);
10783 return RBOOL(cr
== ENC_CODERANGE_7BIT
);
10787 rb_str_ellipsize(VALUE str
, long len
)
10789 static const char ellipsis
[] = "...";
10790 const long ellipsislen
= sizeof(ellipsis
) - 1;
10791 rb_encoding
*const enc
= rb_enc_get(str
);
10792 const long blen
= RSTRING_LEN(str
);
10793 const char *const p
= RSTRING_PTR(str
), *e
= p
+ blen
;
10794 VALUE estr
, ret
= 0;
10796 if (len
< 0) rb_raise(rb_eIndexError
, "negative length %ld", len
);
10797 if (len
* rb_enc_mbminlen(enc
) >= blen
||
10798 (e
= rb_enc_nth(p
, e
, len
, enc
)) - p
== blen
) {
10801 else if (len
<= ellipsislen
||
10802 !(e
= rb_enc_step_back(p
, e
, e
, len
= ellipsislen
, enc
))) {
10803 if (rb_enc_asciicompat(enc
)) {
10804 ret
= rb_str_new(ellipsis
, len
);
10805 rb_enc_associate(ret
, enc
);
10808 estr
= rb_usascii_str_new(ellipsis
, len
);
10809 ret
= rb_str_encode(estr
, rb_enc_from_encoding(enc
), 0, Qnil
);
10812 else if (ret
= rb_str_subseq(str
, 0, e
- p
), rb_enc_asciicompat(enc
)) {
10813 rb_str_cat(ret
, ellipsis
, ellipsislen
);
10816 estr
= rb_str_encode(rb_usascii_str_new(ellipsis
, ellipsislen
),
10817 rb_enc_from_encoding(enc
), 0, Qnil
);
10818 rb_str_append(ret
, estr
);
10824 str_compat_and_valid(VALUE str
, rb_encoding
*enc
)
10827 str
= StringValue(str
);
10828 cr
= rb_enc_str_coderange(str
);
10829 if (cr
== ENC_CODERANGE_BROKEN
) {
10830 rb_raise(rb_eArgError
, "replacement must be valid byte sequence '%+"PRIsVALUE
"'", str
);
10833 rb_encoding
*e
= STR_ENC_GET(str
);
10834 if (cr
== ENC_CODERANGE_7BIT
? rb_enc_mbminlen(enc
) != 1 : enc
!= e
) {
10835 rb_raise(rb_eEncCompatError
, "incompatible character encodings: %s and %s",
10836 rb_enc_name(enc
), rb_enc_name(e
));
10842 static VALUE
enc_str_scrub(rb_encoding
*enc
, VALUE str
, VALUE repl
, int cr
);
10845 rb_str_scrub(VALUE str
, VALUE repl
)
10847 rb_encoding
*enc
= STR_ENC_GET(str
);
10848 return enc_str_scrub(enc
, str
, repl
, ENC_CODERANGE(str
));
10852 rb_enc_str_scrub(rb_encoding
*enc
, VALUE str
, VALUE repl
)
10854 int cr
= ENC_CODERANGE_UNKNOWN
;
10855 if (enc
== STR_ENC_GET(str
)) {
10856 /* cached coderange makes sense only when enc equals the
10857 * actual encoding of str */
10858 cr
= ENC_CODERANGE(str
);
10860 return enc_str_scrub(enc
, str
, repl
, cr
);
10864 enc_str_scrub(rb_encoding
*enc
, VALUE str
, VALUE repl
, int cr
)
10868 const char *rep
, *p
, *e
, *p1
, *sp
;
10872 if (rb_block_given_p()) {
10874 rb_raise(rb_eArgError
, "both of block and replacement given");
10878 if (ENC_CODERANGE_CLEAN_P(cr
))
10881 if (!NIL_P(repl
)) {
10882 repl
= str_compat_and_valid(repl
, enc
);
10885 if (rb_enc_dummy_p(enc
)) {
10888 encidx
= rb_enc_to_index(enc
);
10890 #define DEFAULT_REPLACE_CHAR(str) do { \
10891 static const char replace[sizeof(str)-1] = str; \
10892 rep = replace; replen = (int)sizeof(replace); \
10895 slen
= RSTRING_LEN(str
);
10896 p
= RSTRING_PTR(str
);
10897 e
= RSTRING_END(str
);
10901 if (rb_enc_asciicompat(enc
)) {
10907 else if (!NIL_P(repl
)) {
10908 rep
= RSTRING_PTR(repl
);
10909 replen
= RSTRING_LEN(repl
);
10910 rep7bit_p
= (ENC_CODERANGE(repl
) == ENC_CODERANGE_7BIT
);
10912 else if (encidx
== rb_utf8_encindex()) {
10913 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10917 DEFAULT_REPLACE_CHAR("?");
10920 cr
= ENC_CODERANGE_7BIT
;
10922 p
= search_nonascii(p
, e
);
10927 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
10928 if (MBCLEN_NEEDMORE_P(ret
)) {
10931 else if (MBCLEN_CHARFOUND_P(ret
)) {
10932 cr
= ENC_CODERANGE_VALID
;
10933 p
+= MBCLEN_CHARFOUND_LEN(ret
);
10935 else if (MBCLEN_INVALID_P(ret
)) {
10937 * p1~p: valid ascii/multibyte chars
10938 * p ~e: invalid bytes + unknown bytes
10940 long clen
= rb_enc_mbmaxlen(enc
);
10941 if (NIL_P(buf
)) buf
= rb_str_buf_new(RSTRING_LEN(str
));
10943 rb_str_buf_cat(buf
, p1
, p
- p1
);
10946 if (e
- p
< clen
) clen
= e
- p
;
10953 for (; clen
> 1; clen
--) {
10954 ret
= rb_enc_precise_mbclen(q
, q
+ clen
, enc
);
10955 if (MBCLEN_NEEDMORE_P(ret
)) break;
10956 if (MBCLEN_INVALID_P(ret
)) continue;
10961 rb_str_buf_cat(buf
, rep
, replen
);
10962 if (!rep7bit_p
) cr
= ENC_CODERANGE_VALID
;
10965 repl
= rb_yield(rb_enc_str_new(p
, clen
, enc
));
10966 str_mod_check(str
, sp
, slen
);
10967 repl
= str_compat_and_valid(repl
, enc
);
10968 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
10969 if (ENC_CODERANGE(repl
) == ENC_CODERANGE_VALID
)
10970 cr
= ENC_CODERANGE_VALID
;
10974 p
= search_nonascii(p
, e
);
10986 ENC_CODERANGE_SET(str
, cr
);
10989 buf
= rb_str_buf_new(RSTRING_LEN(str
));
10992 rb_str_buf_cat(buf
, p1
, p
- p1
);
10996 rb_str_buf_cat(buf
, rep
, replen
);
10997 if (!rep7bit_p
) cr
= ENC_CODERANGE_VALID
;
11000 repl
= rb_yield(rb_enc_str_new(p
, e
-p
, enc
));
11001 str_mod_check(str
, sp
, slen
);
11002 repl
= str_compat_and_valid(repl
, enc
);
11003 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
11004 if (ENC_CODERANGE(repl
) == ENC_CODERANGE_VALID
)
11005 cr
= ENC_CODERANGE_VALID
;
11010 /* ASCII incompatible */
11011 long mbminlen
= rb_enc_mbminlen(enc
);
11015 else if (!NIL_P(repl
)) {
11016 rep
= RSTRING_PTR(repl
);
11017 replen
= RSTRING_LEN(repl
);
11019 else if (encidx
== ENCINDEX_UTF_16BE
) {
11020 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11022 else if (encidx
== ENCINDEX_UTF_16LE
) {
11023 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11025 else if (encidx
== ENCINDEX_UTF_32BE
) {
11026 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11028 else if (encidx
== ENCINDEX_UTF_32LE
) {
11029 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11032 DEFAULT_REPLACE_CHAR("?");
11036 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
11037 if (MBCLEN_NEEDMORE_P(ret
)) {
11040 else if (MBCLEN_CHARFOUND_P(ret
)) {
11041 p
+= MBCLEN_CHARFOUND_LEN(ret
);
11043 else if (MBCLEN_INVALID_P(ret
)) {
11045 long clen
= rb_enc_mbmaxlen(enc
);
11046 if (NIL_P(buf
)) buf
= rb_str_buf_new(RSTRING_LEN(str
));
11047 if (p
> p1
) rb_str_buf_cat(buf
, p1
, p
- p1
);
11049 if (e
- p
< clen
) clen
= e
- p
;
11050 if (clen
<= mbminlen
* 2) {
11055 for (; clen
> mbminlen
; clen
-=mbminlen
) {
11056 ret
= rb_enc_precise_mbclen(q
, q
+ clen
, enc
);
11057 if (MBCLEN_NEEDMORE_P(ret
)) break;
11058 if (MBCLEN_INVALID_P(ret
)) continue;
11063 rb_str_buf_cat(buf
, rep
, replen
);
11066 repl
= rb_yield(rb_enc_str_new(p
, clen
, enc
));
11067 str_mod_check(str
, sp
, slen
);
11068 repl
= str_compat_and_valid(repl
, enc
);
11069 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
11080 ENC_CODERANGE_SET(str
, ENC_CODERANGE_VALID
);
11083 buf
= rb_str_buf_new(RSTRING_LEN(str
));
11086 rb_str_buf_cat(buf
, p1
, p
- p1
);
11090 rb_str_buf_cat(buf
, rep
, replen
);
11093 repl
= rb_yield(rb_enc_str_new(p
, e
-p
, enc
));
11094 str_mod_check(str
, sp
, slen
);
11095 repl
= str_compat_and_valid(repl
, enc
);
11096 rb_str_buf_cat(buf
, RSTRING_PTR(repl
), RSTRING_LEN(repl
));
11099 cr
= ENC_CODERANGE_VALID
;
11101 ENCODING_CODERANGE_SET(buf
, rb_enc_to_index(enc
), cr
);
11107 * str.scrub -> new_str
11108 * str.scrub(repl) -> new_str
11109 * str.scrub{|bytes|} -> new_str
11111 * If the string contains invalid byte sequences, replaces the invalid bytes with the
11112 * given replacement string; otherwise returns a copy of the string.
11113 * If a block is given, replaces the invalid bytes with the value returned by the block.
11115 * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
11116 * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
11117 * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11120 str_scrub(int argc
, VALUE
*argv
, VALUE str
)
11122 VALUE repl
= argc
? (rb_check_arity(argc
, 0, 1), argv
[0]) : Qnil
;
11123 VALUE
new = rb_str_scrub(str
, repl
);
11124 return NIL_P(new) ? str_duplicate(rb_cString
, str
): new;
11129 * str.scrub! -> str
11130 * str.scrub!(repl) -> str
11131 * str.scrub!{|bytes|} -> str
11133 * If the string contains invalid byte sequences, replaces the invalid bytes with the
11134 * given replacement string; otherwise returns self unchanged.
11135 * If a block is given, replaces the invalid bytes with the value returned by the block.
11137 * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
11138 * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
11139 * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11142 str_scrub_bang(int argc
, VALUE
*argv
, VALUE str
)
11144 VALUE repl
= argc
? (rb_check_arity(argc
, 0, 1), argv
[0]) : Qnil
;
11145 VALUE
new = rb_str_scrub(str
, repl
);
11146 if (!NIL_P(new)) rb_str_replace(str
, new);
11150 static ID id_normalize
;
11151 static ID id_normalized_p
;
11152 static VALUE mUnicodeNormalize
;
11155 unicode_normalize_common(int argc
, VALUE
*argv
, VALUE str
, ID id
)
11157 static int UnicodeNormalizeRequired
= 0;
11160 if (!UnicodeNormalizeRequired
) {
11161 rb_require("unicode_normalize/normalize.rb");
11162 UnicodeNormalizeRequired
= 1;
11165 if (rb_check_arity(argc
, 0, 1)) argv2
[1] = argv
[0];
11166 return rb_funcallv(mUnicodeNormalize
, id
, argc
+1, argv2
);
11171 * str.unicode_normalize(form=:nfc)
11173 * Unicode Normalization---Returns a normalized form of +str+,
11174 * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
11175 * The normalization form used is determined by +form+, which can
11176 * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11177 * The default is +:nfc+.
11179 * If the string is not in a Unicode Encoding, then an Exception is raised.
11180 * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
11181 * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
11182 * Anything other than UTF-8 is implemented by converting to UTF-8,
11183 * which makes it slower than UTF-8.
11185 * "a\u0300".unicode_normalize #=> "\u00E0"
11186 * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
11187 * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
11188 * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
11189 * #=> Encoding::CompatibilityError raised
11192 rb_str_unicode_normalize(int argc
, VALUE
*argv
, VALUE str
)
11194 return unicode_normalize_common(argc
, argv
, str
, id_normalize
);
11199 * str.unicode_normalize!(form=:nfc)
11201 * Destructive version of String#unicode_normalize, doing Unicode
11202 * normalization in place.
11205 rb_str_unicode_normalize_bang(int argc
, VALUE
*argv
, VALUE str
)
11207 return rb_str_replace(str
, unicode_normalize_common(argc
, argv
, str
, id_normalize
));
11211 * str.unicode_normalized?(form=:nfc)
11213 * Checks whether +str+ is in Unicode normalization form +form+,
11214 * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11215 * The default is +:nfc+.
11217 * If the string is not in a Unicode Encoding, then an Exception is raised.
11218 * For details, see String#unicode_normalize.
11220 * "a\u0300".unicode_normalized? #=> false
11221 * "a\u0300".unicode_normalized?(:nfd) #=> true
11222 * "\u00E0".unicode_normalized? #=> true
11223 * "\u00E0".unicode_normalized?(:nfd) #=> false
11224 * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
11225 * #=> Encoding::CompatibilityError raised
11228 rb_str_unicode_normalized_p(int argc
, VALUE
*argv
, VALUE str
)
11230 return unicode_normalize_common(argc
, argv
, str
, id_normalized_p
);
11233 /**********************************************************************
11234 * Document-class: Symbol
11236 * Symbol objects represent named identifiers inside the Ruby interpreter.
11238 * You can create a \Symbol object explicitly with:
11240 * - A {symbol literal}[doc/syntax/literals_rdoc.html#label-Symbol+Literals].
11242 * The same Symbol object will be
11243 * created for a given name or string for the duration of a program's
11244 * execution, regardless of the context or meaning of that name. Thus
11245 * if <code>Fred</code> is a constant in one context, a method in
11246 * another, and a class in a third, the Symbol <code>:Fred</code>
11247 * will be the same object in all three contexts.
11261 * $f1.object_id #=> 2514190
11262 * $f2.object_id #=> 2514190
11263 * $f3.object_id #=> 2514190
11265 * Constant, method, and variable names are returned as symbols:
11278 * One.instance_methods(true)
11280 * One.instance_variables
11282 * One.class_variables
11284 * global_variables.grep(/six/)
11289 * Symbol objects are different from String objects in that
11290 * Symbol objects represent identifiers, while String objects
11291 * represent text or data.
11295 * First, what's elsewhere. \Class \Symbol:
11297 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
11298 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
11300 * Here, class \Symbol provides methods that are useful for:
11302 * - {Querying}[#class-Symbol-label-Methods+for+Querying]
11303 * - {Comparing}[#class-Symbol-label-Methods+for+Comparing]
11304 * - {Converting}[#class-Symbol-label-Methods+for+Converting]
11306 * === Methods for Querying
11308 * - ::all_symbols:: Returns an array of the symbols currently in Ruby's symbol table.
11309 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring
11310 * in symbol that matches a given Regexp
11311 * or other object; returns +nil+ if no match is found.
11312 * - #[], #slice :: Returns a substring of symbol
11313 * determined by a given index, start/length, or range, or string.
11314 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11315 * - #encoding:: Returns the Encoding object that represents the encoding of symbol.
11317 * - #end_with?:: Returns +true+ if symbol ends with
11318 * any of the given strings.
11319 * - #match:: Returns a MatchData object if symbol
11320 * matches a given Regexp; +nil+ otherwise.
11321 * - #match?:: Returns +true+ if symbol
11322 * matches a given Regexp; +false+ otherwise.
11323 * - #length, #size:: Returns the number of characters in symbol.
11324 * - #start_with?:: Returns +true+ if symbol starts with
11325 * any of the given strings.
11327 * === Methods for Comparing
11329 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as symbol is smaller than, equal to, or larger than a given symbol.
11330 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given symbol
11331 * has the same content and encoding.
11332 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as symbol is
11333 * smaller than, equal to, or larger than a given symbol.
11334 * - #casecmp?:: Returns +true+ if symbol is equal to a given symbol
11335 * after Unicode case folding; +false+ otherwise.
11337 * === Methods for Converting
11339 * - #capitalize:: Returns symbol with the first character upcased
11340 * and all other characters downcased.
11341 * - #downcase:: Returns symbol with all characters downcased.
11342 * - #inspect:: Returns the string representation of +self+ as a symbol literal.
11343 * - #name:: Returns the frozen string corresponding to symbol.
11344 * - #succ, #next:: Returns the symbol that is the successor to symbol.
11345 * - #swapcase:: Returns symbol with all upcase characters downcased
11346 * and all downcase characters upcased.
11347 * - #to_proc:: Returns a Proc object which responds to the method named by symbol.
11348 * - #to_s, #id2name:: Returns the string corresponding to +self+.
11349 * - #to_sym, #intern:: Returns +self+.
11350 * - #upcase:: Returns symbol with all characters upcased.
11357 * sym == obj -> true or false
11359 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
11360 * symbol, returns <code>true</code>.
11363 #define sym_equal rb_obj_equal
11366 sym_printable(const char *s
, const char *send
, rb_encoding
*enc
)
11370 int c
= rb_enc_precise_mbclen(s
, send
, enc
);
11372 if (!MBCLEN_CHARFOUND_P(c
)) return FALSE
;
11373 n
= MBCLEN_CHARFOUND_LEN(c
);
11374 c
= rb_enc_mbc_to_codepoint(s
, send
, enc
);
11375 if (!rb_enc_isprint(c
, enc
)) return FALSE
;
11382 rb_str_symname_p(VALUE sym
)
11387 rb_encoding
*resenc
= rb_default_internal_encoding();
11389 if (resenc
== NULL
) resenc
= rb_default_external_encoding();
11390 enc
= STR_ENC_GET(sym
);
11391 ptr
= RSTRING_PTR(sym
);
11392 len
= RSTRING_LEN(sym
);
11393 if ((resenc
!= enc
&& !rb_str_is_ascii_only_p(sym
)) || len
!= (long)strlen(ptr
) ||
11394 !rb_enc_symname2_p(ptr
, len
, enc
) || !sym_printable(ptr
, ptr
+ len
, enc
)) {
11401 rb_str_quote_unprintable(VALUE str
)
11406 rb_encoding
*resenc
;
11408 Check_Type(str
, T_STRING
);
11409 resenc
= rb_default_internal_encoding();
11410 if (resenc
== NULL
) resenc
= rb_default_external_encoding();
11411 enc
= STR_ENC_GET(str
);
11412 ptr
= RSTRING_PTR(str
);
11413 len
= RSTRING_LEN(str
);
11414 if ((resenc
!= enc
&& !rb_str_is_ascii_only_p(str
)) ||
11415 !sym_printable(ptr
, ptr
+ len
, enc
)) {
11416 return rb_str_escape(str
);
11421 MJIT_FUNC_EXPORTED VALUE
11422 rb_id_quote_unprintable(ID id
)
11424 VALUE str
= rb_id2str(id
);
11425 if (!rb_str_symname_p(str
)) {
11426 return rb_str_escape(str
);
11433 * sym.inspect -> string
11435 * Returns the representation of <i>sym</i> as a symbol literal.
11437 * :fred.inspect #=> ":fred"
11441 sym_inspect(VALUE sym
)
11443 VALUE str
= rb_sym2str(sym
);
11448 if (!rb_str_symname_p(str
)) {
11449 str
= rb_str_inspect(str
);
11450 len
= RSTRING_LEN(str
);
11451 rb_str_resize(str
, len
+ 1);
11452 dest
= RSTRING_PTR(str
);
11453 memmove(dest
+ 1, dest
, len
);
11456 rb_encoding
*enc
= STR_ENC_GET(str
);
11457 RSTRING_GETMEM(str
, ptr
, len
);
11458 str
= rb_enc_str_new(0, len
+ 1, enc
);
11459 dest
= RSTRING_PTR(str
);
11460 memcpy(dest
+ 1, ptr
, len
);
11466 #if 0 /* for RDoc */
11469 * sym.name -> string
11471 * Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
11472 * returned string is frozen.
11474 * :fred.name #=> "fred"
11475 * :fred.name.frozen? #=> true
11476 * :fred.to_s #=> "fred"
11477 * :fred.to_s.frozen? #=> false
11480 rb_sym2str(VALUE sym
)
11489 * sym.id2name -> string
11490 * sym.to_s -> string
11492 * Returns the name or string corresponding to <i>sym</i>.
11494 * :fred.id2name #=> "fred"
11495 * :ginger.to_s #=> "ginger"
11497 * Note that this string is not frozen (unlike the symbol itself).
11498 * To get a frozen string, use #name.
11503 rb_sym_to_s(VALUE sym
)
11505 return str_new_shared(rb_cString
, rb_sym2str(sym
));
11511 * sym.to_sym -> sym
11512 * sym.intern -> sym
11514 * In general, <code>to_sym</code> returns the Symbol corresponding
11515 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned in this case.
11520 sym_to_sym(VALUE sym
)
11525 MJIT_FUNC_EXPORTED VALUE
11526 rb_sym_proc_call(ID mid
, int argc
, const VALUE
*argv
, int kw_splat
, VALUE passed_proc
)
11531 rb_raise(rb_eArgError
, "no receiver given");
11534 return rb_funcall_with_block_kw(obj
, mid
, argc
- 1, argv
+ 1, passed_proc
, kw_splat
);
11542 * Returns a _Proc_ object which responds to the method named by _sym_.
11544 * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
11548 rb_sym_to_proc(VALUE sym
)
11558 * Same as <code>sym.to_s.succ.intern</code>.
11562 sym_succ(VALUE sym
)
11564 return rb_str_intern(rb_str_succ(rb_sym2str(sym
)));
11570 * symbol <=> other_symbol -> -1, 0, +1, or nil
11572 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11573 * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11574 * less than, equal to, or greater than +other_symbol+.
11576 * +nil+ is returned if the two values are incomparable.
11578 * See String#<=> for more information.
11582 sym_cmp(VALUE sym
, VALUE other
)
11584 if (!SYMBOL_P(other
)) {
11587 return rb_str_cmp_m(rb_sym2str(sym
), rb_sym2str(other
));
11592 * casecmp(other_symbol) -> -1, 0, 1, or nil
11594 * Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
11596 * :aBcDeF.casecmp(:abcde) # => 1
11597 * :aBcDeF.casecmp(:abcdef) # => 0
11598 * :aBcDeF.casecmp(:abcdefg) # => -1
11599 * :abcdef.casecmp(:ABCDEF) # => 0
11601 * Returns +nil+ if the two symbols have incompatible encodings,
11602 * or if +other_symbol+ is not a symbol:
11604 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11605 * other_sym = :"\u{c4 d6 dc}"
11606 * sym.casecmp(other_sym) # => nil
11607 * :foo.casecmp(2) # => nil
11609 * Currently, case-insensitivity only works on characters A-Z/a-z,
11610 * not all of Unicode. This is different from Symbol#casecmp?.
11612 * Related: Symbol#casecmp?.
11617 sym_casecmp(VALUE sym
, VALUE other
)
11619 if (!SYMBOL_P(other
)) {
11622 return str_casecmp(rb_sym2str(sym
), rb_sym2str(other
));
11627 * casecmp?(other_symbol) -> true, false, or nil
11629 * Returns +true+ if +sym+ and +other_symbol+ are equal after
11630 * Unicode case folding, +false+ if they are not equal:
11632 * :aBcDeF.casecmp?(:abcde) # => false
11633 * :aBcDeF.casecmp?(:abcdef) # => true
11634 * :aBcDeF.casecmp?(:abcdefg) # => false
11635 * :abcdef.casecmp?(:ABCDEF) # => true
11636 * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11638 * Returns +nil+ if the two symbols have incompatible encodings,
11639 * or if +other_symbol+ is not a symbol:
11641 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11642 * other_sym = :"\u{c4 d6 dc}"
11643 * sym.casecmp?(other_sym) # => nil
11644 * :foo.casecmp?(2) # => nil
11646 * See {Case Mapping}[doc/case_mapping_rdoc.html].
11648 * Related: Symbol#casecmp.
/*
 * Implementation of Symbol#casecmp?.
 *
 * sym   - the receiver symbol.
 * other - the object to compare against.
 *
 * When other is a Symbol, both operands are converted with rb_sym2str()
 * and the test is delegated to str_casecmp_p().
 * NOTE(review): the body of the !SYMBOL_P(other) branch is not visible in
 * this excerpt; the documentation above says nil is returned when other is
 * not a symbol -- confirm against the full file.
 */
11653 sym_casecmp_p(VALUE sym
, VALUE other
)
11655 if (!SYMBOL_P(other
)) {
11658 return str_casecmp_p(rb_sym2str(sym
), rb_sym2str(other
));
11663 * sym =~ obj -> integer or nil
11665 * Returns <code>sym.to_s =~ obj</code>.
/*
 * Implementation of Symbol#=~.
 *
 * Converts sym with rb_sym2str() and forwards it, together with other
 * (passed through unchanged), to rb_str_match(); that call's result is
 * returned as-is.
 */
11669 sym_match(VALUE sym
, VALUE other
)
11671 return rb_str_match(rb_sym2str(sym
), other
);
11676 * sym.match(pattern) -> matchdata or nil
11677 * sym.match(pattern, pos) -> matchdata or nil
11679 * Returns <code>sym.to_s.match</code>.
/*
 * Implementation of Symbol#match.
 *
 * argc/argv - the method arguments, forwarded untouched.
 * sym       - the receiver symbol.
 *
 * Delegates to rb_str_match_m() with the receiver replaced by its string
 * form (rb_sym2str(sym)) and returns that call's result.
 */
11683 sym_match_m(int argc
, VALUE
*argv
, VALUE sym
)
11685 return rb_str_match_m(argc
, argv
, rb_sym2str(sym
));
11690 * sym.match?(pattern) -> true or false
11691 * sym.match?(pattern, pos) -> true or false
11693 * Returns <code>sym.to_s.match?</code>.
/*
 * Implementation of Symbol#match?.
 *
 * argc/argv - the method arguments, forwarded untouched.
 * sym       - the receiver symbol.
 *
 * Delegates to rb_str_match_m_p() and returns its result.
 * NOTE(review): unlike the neighbouring wrappers, sym is passed as-is
 * rather than through rb_sym2str(); presumably the callee accepts a Symbol
 * receiver -- confirm against rb_str_match_m_p before changing this.
 */
11697 sym_match_m_p(int argc
, VALUE
*argv
, VALUE sym
)
11699 return rb_str_match_m_p(argc
, argv
, sym
);
11705 * sym[b, n] -> string
11706 * sym.slice(idx) -> char
11707 * sym.slice(b, n) -> string
11709 * Returns <code>sym.to_s[]</code>.
/*
 * Implementation of Symbol#[] and Symbol#slice (per the signatures listed
 * in the documentation above).
 *
 * argc/argv - the method arguments, forwarded untouched.
 * sym       - the receiver symbol.
 *
 * Delegates to rb_str_aref_m() on the string form of the receiver
 * (rb_sym2str(sym)) and returns that call's result.
 */
11713 sym_aref(int argc
, VALUE
*argv
, VALUE sym
)
11715 return rb_str_aref_m(argc
, argv
, rb_sym2str(sym
));
11720 * sym.length -> integer
11721 * sym.size -> integer
11723 * Same as <code>sym.to_s.length</code>.
/*
 * Implementation of Symbol#length and Symbol#size (per the documentation
 * above): returns rb_str_length() of the receiver's string form.
 */
11727 sym_length(VALUE sym
)
11729 return rb_str_length(rb_sym2str(sym
));
11734 * sym.empty? -> true or false
11736 * Returns whether _sym_ is :"" or not.
/*
 * Implementation of Symbol#empty?: returns rb_str_empty() of the
 * receiver's string form (rb_sym2str(sym)).
 */
11740 sym_empty(VALUE sym
)
11742 return rb_str_empty(rb_sym2str(sym
));
11747 * upcase(*options) -> symbol
11749 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11751 * See String#upcase.
/*
 * Implementation of Symbol#upcase.
 *
 * argc/argv - the case-mapping options, forwarded untouched.
 * sym       - the receiver symbol.
 *
 * Converts the receiver with rb_sym2str(), applies rb_str_upcase() to it,
 * and turns the resulting string back into a symbol with rb_str_intern().
 */
11756 sym_upcase(int argc
, VALUE
*argv
, VALUE sym
)
11758 return rb_str_intern(rb_str_upcase(argc
, argv
, rb_sym2str(sym
)));
11763 * downcase(*options) -> symbol
11765 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11767 * See String#downcase.
11769 * Related: Symbol#upcase.
/*
 * Implementation of Symbol#downcase.
 *
 * argc/argv - the case-mapping options, forwarded untouched.
 * sym       - the receiver symbol.
 *
 * Converts the receiver with rb_sym2str(), applies rb_str_downcase() to
 * it, and turns the resulting string back into a symbol with
 * rb_str_intern().
 */
11774 sym_downcase(int argc
, VALUE
*argv
, VALUE sym
)
11776 return rb_str_intern(rb_str_downcase(argc
, argv
, rb_sym2str(sym
)));
11781 * capitalize(*options) -> symbol
11783 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11785 * See String#capitalize.
/*
 * Implementation of Symbol#capitalize.
 *
 * argc/argv - the case-mapping options, forwarded untouched.
 * sym       - the receiver symbol.
 *
 * Converts the receiver with rb_sym2str(), applies rb_str_capitalize() to
 * it, and turns the resulting string back into a symbol with
 * rb_str_intern().
 */
11790 sym_capitalize(int argc
, VALUE
*argv
, VALUE sym
)
11792 return rb_str_intern(rb_str_capitalize(argc
, argv
, rb_sym2str(sym
)));
11797 * swapcase(*options) -> symbol
11799 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11801 * See String#swapcase.
/*
 * Implementation of Symbol#swapcase.
 *
 * argc/argv - the case-mapping options, forwarded untouched.
 * sym       - the receiver symbol.
 *
 * Converts the receiver with rb_sym2str(), applies rb_str_swapcase() to
 * it, and turns the resulting string back into a symbol with
 * rb_str_intern().
 */
11806 sym_swapcase(int argc
, VALUE
*argv
, VALUE sym
)
11808 return rb_str_intern(rb_str_swapcase(argc
, argv
, rb_sym2str(sym
)));
11813 * sym.start_with?([prefixes]+) -> true or false
11815 * Returns true if +sym+ starts with one of the +prefixes+ given.
11816 * Each of the +prefixes+ should be a String or a Regexp.
11818 * :hello.start_with?("hell") #=> true
11819 * :hello.start_with?(/H/i) #=> true
11821 * # returns true if one of the prefixes matches.
11822 * :hello.start_with?("heaven", "hell") #=> true
11823 * :hello.start_with?("heaven", "paradise") #=> false
/*
 * Implementation of Symbol#start_with?.
 *
 * argc/argv - the candidate prefixes, forwarded untouched.
 * sym       - the receiver symbol.
 *
 * Delegates to rb_str_start_with() on the receiver's string form
 * (rb_sym2str(sym)) and returns that call's result.
 */
11827 sym_start_with(int argc
, VALUE
*argv
, VALUE sym
)
11829 return rb_str_start_with(argc
, argv
, rb_sym2str(sym
));
11834 * sym.end_with?([suffixes]+) -> true or false
11836 * Returns true if +sym+ ends with one of the +suffixes+ given.
11838 * :hello.end_with?("ello") #=> true
11840 * # returns true if one of the +suffixes+ matches.
11841 * :hello.end_with?("heaven", "ello") #=> true
11842 * :hello.end_with?("heaven", "paradise") #=> false
/*
 * Implementation of Symbol#end_with?.
 *
 * argc/argv - the candidate suffixes, forwarded untouched.
 * sym       - the receiver symbol.
 *
 * Delegates to rb_str_end_with() on the receiver's string form
 * (rb_sym2str(sym)) and returns that call's result.
 */
11846 sym_end_with(int argc
, VALUE
*argv
, VALUE sym
)
11848 return rb_str_end_with(argc
, argv
, rb_sym2str(sym
));
11853 * sym.encoding -> encoding
11855 * Returns the Encoding object that represents the encoding of _sym_.
/*
 * Implementation of Symbol#encoding: returns rb_obj_encoding() of the
 * receiver's string form (rb_sym2str(sym)).
 */
11859 sym_encoding(VALUE sym
)
11861 return rb_obj_encoding(rb_sym2str(sym
));
/*
 * Helper used by rb_to_id() and rb_to_symbol() to obtain a String from a
 * non-Symbol name.
 *
 * When name is not already a T_STRING it is run through
 * rb_check_string_type(); the visible rb_raise() builds a TypeError whose
 * message formats a value with "%+"PRIsVALUE followed by
 * " is not a symbol".
 * NOTE(review): the condition guarding the raise, the argument supplied to
 * the format, and the return statement are not visible in this excerpt --
 * confirm against the full file.
 */
11865 string_for_symbol(VALUE name
)
11867 if (!RB_TYPE_P(name
, T_STRING
)) {
11868 VALUE tmp
= rb_check_string_type(name
);
11870 rb_raise(rb_eTypeError
, "%+"PRIsVALUE
" is not a symbol",
/*
 * Converts name to an ID.
 *
 * A Symbol is converted directly with SYM2ID(). Any other value is first
 * passed through string_for_symbol() and the result is interned with
 * rb_intern_str().
 */
11879 rb_to_id(VALUE name
)
11881 if (SYMBOL_P(name
)) {
11882 return SYM2ID(name
);
11884 name
= string_for_symbol(name
);
11885 return rb_intern_str(name
);
/*
 * Converts name to a Symbol.
 *
 * A non-Symbol value is passed through string_for_symbol() and the result
 * is turned into a symbol with rb_str_intern().
 * NOTE(review): the body of the SYMBOL_P(name) branch is not visible in
 * this excerpt; presumably it returns name unchanged -- confirm against
 * the full file.
 */
11889 rb_to_symbol(VALUE name
)
11891 if (SYMBOL_P(name
)) {
11894 name
= string_for_symbol(name
);
11895 return rb_str_intern(name
);
11900 * Symbol.all_symbols => array
11902 * Returns an array of all the symbols currently in Ruby's symbol
11905 * Symbol.all_symbols.size #=> 903
11906 * Symbol.all_symbols[1,20] #=> [:floor, :ARGV, :Binding, :symlink,
11907 * :chown, :EOFError, :$;, :String,
11908 * :LOCK_SH, :"setuid?", :$<,
11909 * :default_proc, :compact, :extend,
11910 * :Tms, :getwd, :$=, :ThreadGroup,
/*
 * Implementation of Symbol.all_symbols: returns rb_sym_all_symbols().
 * The single parameter (named _) is not used.
 */
11915 sym_all_symbols(VALUE _
)
11917 return rb_sym_all_symbols();
/*
 * Returns the result of rb_fstring(str); this is a thin wrapper with no
 * additional processing of str.
 */
11921 rb_str_to_interned_str(VALUE str
)
11923 return rb_fstring(str
);
/*
 * Returns the registered fstring for the len bytes starting at ptr.
 *
 * ptr - start of the byte sequence.
 * len - number of bytes.
 *
 * A local struct RString (fake_str) is initialised by setup_fake_str()
 * with ENCINDEX_US_ASCII and handed to register_fstring() with TRUE as
 * the second argument.
 * NOTE(review): the meaning of that TRUE flag is defined by
 * register_fstring, which is outside this excerpt -- confirm there.
 */
11927 rb_interned_str(const char *ptr
, long len
)
11929 struct RString fake_str
;
11930 return register_fstring(setup_fake_str(&fake_str
, ptr
, len
, ENCINDEX_US_ASCII
), TRUE
);
/*
 * C-string convenience wrapper around rb_interned_str(): the length is
 * computed with strlen(), so ptr must point to a NUL-terminated string.
 */
11934 rb_interned_str_cstr(const char *ptr
)
11936 return rb_interned_str(ptr
, strlen(ptr
));
/*
 * Encoding-aware counterpart of rb_interned_str().
 *
 * ptr - start of the byte sequence.
 * len - number of bytes.
 * enc - encoding to associate with the string.
 *
 * If rb_enc_autoload_p(enc) reports true, rb_enc_autoload(enc) is called
 * first (the check is marked UNLIKELY). A local struct RString (fake_str)
 * is then initialised by rb_setup_fake_str() with enc and handed to
 * register_fstring() with TRUE as the second argument.
 * NOTE(review): the meaning of that TRUE flag is defined by
 * register_fstring, which is outside this excerpt -- confirm there.
 */
11940 rb_enc_interned_str(const char *ptr
, long len
, rb_encoding
*enc
)
11942 if (UNLIKELY(rb_enc_autoload_p(enc
))) {
11943 rb_enc_autoload(enc
);
11946 struct RString fake_str
;
11947 return register_fstring(rb_setup_fake_str(&fake_str
, ptr
, len
, enc
), TRUE
);
/*
 * C-string convenience wrapper around rb_enc_interned_str(): the length
 * is computed with strlen(), so ptr must point to a NUL-terminated string;
 * enc is forwarded unchanged.
 */
11951 rb_enc_interned_str_cstr(const char *ptr
, rb_encoding
*enc
)
11953 return rb_enc_interned_str(ptr
, strlen(ptr
), enc
);
11957 * A \String object has an arbitrary sequence of bytes,
11958 * typically representing text or binary data.
11959 * A \String object may be created using String::new or as literals.
11961 * String objects differ from Symbol objects in that Symbol objects are
11962 * designed to be used as identifiers, instead of text or data.
11964 * You can create a \String object explicitly with:
11966 * - A {string literal}[doc/syntax/literals_rdoc.html#label-String+Literals].
11967 * - A {heredoc literal}[doc/syntax/literals_rdoc.html#label-Here+Document+Literals].
11969 * You can convert certain objects to Strings with:
11971 * - \Method {String}[Kernel.html#method-i-String].
11973 * Some \String methods modify +self+.
11974 * Typically, a method whose name ends with <tt>!</tt> modifies +self+
11975 * and returns +self+;
11976 * often a similarly named method (without the <tt>!</tt>)
11977 * returns a new string.
11979 * In general, if there exist both bang and non-bang versions of a method,
11980 * the bang version mutates +self+ and the non-bang version does not.
11981 * However, a method without a bang can also mutate, such as String#replace.
11983 * == Substitution Methods
11985 * These methods perform substitutions:
11987 * - String#sub: One substitution (or none); returns a new string.
11988 * - String#sub!: One substitution (or none); returns +self+.
11989 * - String#gsub: Zero or more substitutions; returns a new string.
11990 * - String#gsub!: Zero or more substitutions; returns +self+.
11992 * Each of these methods takes:
11994 * - A first argument, +pattern+ (string or regexp),
11995 * that specifies the substring(s) to be replaced.
11997 * - Either of these:
11999 * - A second argument, +replacement+ (string or hash),
12000 * that determines the replacing string.
12001 * - A block that will determine the replacing string.
12003 * The examples in this section mostly use methods String#sub and String#gsub;
12004 * the principles illustrated apply to all four substitution methods.
12006 * <b>Argument +pattern+</b>
12008 * Argument +pattern+ is commonly a regular expression:
12011 * s.sub(/[aeiou]/, '*') # => "h*llo"
12012 * s.gsub(/[aeiou]/, '*') # => "h*ll*"
12013 * s.gsub(/[aeiou]/, '') # => "hll"
12014 * s.sub(/ell/, 'al') # => "halo"
12015 * s.gsub(/xyzzy/, '*') # => "hello"
12016 * 'THX1138'.gsub(/\d+/, '00') # => "THX00"
12018 * When +pattern+ is a string, all its characters are treated
12019 * as ordinary characters (not as regexp special characters):
12021 * 'THX1138'.gsub('\d+', '00') # => "THX1138"
12023 * <b>\String +replacement+</b>
12025 * If +replacement+ is a string, that string will determine
12026 * the replacing string that is to be substituted for the matched text.
12028 * Each of the examples above uses a simple string as the replacing string.
12030 * \String +replacement+ may contain back-references to the pattern's captures:
12032 * - <tt>\n</tt> (_n_ a non-negative integer) refers to <tt>$n</tt>.
12033 * - <tt>\k<name></tt> refers to the named capture +name+.
12035 * See rdoc-ref:regexp.rdoc for details.
12037 * Note that within the string +replacement+, a character combination
12038 * such as <tt>$&</tt> is treated as ordinary text, and not as
12039 * a special match variable.
12040 * However, you may refer to some special match variables using these
12043 * - <tt>\&</tt> and <tt>\0</tt> correspond to <tt>$&</tt>,
12044 * which contains the complete matched text.
12045 * - <tt>\'</tt> corresponds to <tt>$'</tt>,
12046 * which contains string after match.
12047 * - <tt>\`</tt> corresponds to <tt>$`</tt>,
12048 * which contains string before match.
12049 * - <tt>\+</tt> corresponds to <tt>$+</tt>,
12050 * which contains last capture group.
12052 * See rdoc-ref:regexp.rdoc for details.
12054 * Note that <tt>\\\\</tt> is interpreted as an escape, i.e., a single backslash.
12056 * Note also that a string literal consumes backslashes.
12057 * See {String Literals}[doc/syntax/literals_rdoc.html#label-String+Literals] for details about string literals.
12059 * A back-reference is typically preceded by an additional backslash.
12060 * For example, if you want to write a back-reference <tt>\&</tt> in
12061 * +replacement+ with a double-quoted string literal, you need to write
12062 * <tt>"..\\\\&.."</tt>.
12064 * If you want to write a non-back-reference string <tt>\&</tt> in
12065 * +replacement+, you need first to escape the backslash to prevent
12066 * this method from interpreting it as a back-reference, and then you
12067 * need to escape the backslashes again to prevent a string literal from
12068 * consuming them: <tt>"..\\\\\\\\&.."</tt>.
12070 * You may want to use the block form to avoid a lot of backslashes.
12072 * <b>\Hash +replacement+</b>
12074 * If argument +replacement+ is a hash, and +pattern+ matches one of its keys,
12075 * the replacing string is the value for that key:
12077 * h = {'foo' => 'bar', 'baz' => 'bat'}
12078 * 'food'.sub('foo', h) # => "bard"
12080 * Note that a symbol key does not match:
12082 * h = {foo: 'bar', baz: 'bat'}
12083 * 'food'.sub('foo', h) # => "d"
12087 * In the block form, the current match string is passed to the block;
12088 * the block's return value becomes the replacing string:
12091 * '1234'.gsub(/\d/) {|match| s.succ! } # => "ABCD"
12093 * Special match variables such as <tt>$1</tt>, <tt>$2</tt>, <tt>$`</tt>,
12094 * <tt>$&</tt>, and <tt>$'</tt> are set appropriately.
12099 * First, what's elsewhere. \Class \String:
12101 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
12102 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
12104 * Here, class \String provides methods that are useful for:
12106 * - {Creating a String}[#class-String-label-Methods+for+Creating+a+String]
12107 * - {Frozen/Unfrozen Strings}[#class-String-label-Methods+for+a+Frozen-2FUnfrozen+String]
12108 * - {Querying}[#class-String-label-Methods+for+Querying]
12109 * - {Comparing}[#class-String-label-Methods+for+Comparing]
12110 * - {Modifying a String}[#class-String-label-Methods+for+Modifying+a+String]
12111 * - {Converting to New String}[#class-String-label-Methods+for+Converting+to+New+String]
12112 * - {Converting to Non-String}[#class-String-label-Methods+for+Converting+to+Non--5CString]
12113 * - {Iterating}[#class-String-label-Methods+for+Iterating]
12115 * === Methods for Creating a \String
12117 * - ::new:: Returns a new string.
12118 * - ::try_convert:: Returns a new string created from a given object.
12120 * === Methods for a Frozen/Unfrozen String
12122 * - {#+string}[#method-i-2B-40]:: Returns a string that is not frozen:
12123 * +self+, if not frozen; +self.dup+ otherwise.
12124 * - {#-string}[#method-i-2D-40]:: Returns a string that is frozen:
12125 * +self+, if already frozen; +self.freeze+ otherwise.
12126 * - #freeze:: Freezes +self+, if not already frozen; returns +self+.
12128 * === Methods for Querying
12132 * - #length, #size:: Returns the count of characters (not bytes).
12133 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12134 * - #bytesize:: Returns the count of bytes.
12135 * - #count:: Returns the count of substrings matching given strings.
12139 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring that matches a given Regexp or other object;
12140 * returns +nil+ if no match is found.
12141 * - #index:: Returns the index of the _first_ occurrence of a given substring;
12142 * returns +nil+ if none found.
12143 * - #rindex:: Returns the index of the _last_ occurrence of a given substring;
12144 * returns +nil+ if none found.
12145 * - #include?:: Returns +true+ if the string contains a given substring; +false+ otherwise.
12146 * - #match:: Returns a MatchData object if the string matches a given Regexp; +nil+ otherwise.
12147 * - #match?:: Returns +true+ if the string matches a given Regexp; +false+ otherwise.
12148 * - #start_with?:: Returns +true+ if the string begins with any of the given substrings.
12149 * - #end_with?:: Returns +true+ if the string ends with any of the given substrings.
12153 * - #encoding:: Returns the Encoding object that represents the encoding of the string.
12154 * - #unicode_normalized?:: Returns +true+ if the string is in Unicode normalized form; +false+ otherwise.
12155 * - #valid_encoding?:: Returns +true+ if the string contains only characters that are valid
12156 * for its encoding.
12157 * - #ascii_only?:: Returns +true+ if the string has only ASCII characters; +false+ otherwise.
12161 * - #sum:: Returns a basic checksum for the string: the sum of each byte.
12162 * - #hash:: Returns the integer hash code.
12164 * === Methods for Comparing
12166 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given other string has the same content as +self+.
12167 * - #eql?:: Returns +true+ if the content is the same as the given other string.
12168 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as +self+ is smaller than, equal to, or larger than a given other string.
12169 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as +self+
12170 * is smaller than, equal to, or larger than a given other string.
12171 * - #casecmp?:: Returns +true+ if the string is equal to a given string after Unicode case folding;
12172 * +false+ otherwise.
12174 * === Methods for Modifying a \String
12176 * Each of these methods modifies +self+.
12180 * - #insert:: Returns +self+ with a given string inserted at a given offset.
12181 * - #<<:: Returns +self+ concatenated with a given string or integer.
12185 * - #sub!:: Replaces the first substring that matches a given pattern with a given replacement string;
12186 * returns +self+ if any changes, +nil+ otherwise.
12187 * - #gsub!:: Replaces each substring that matches a given pattern with a given replacement string;
12188 * returns +self+ if any changes, +nil+ otherwise.
12189 * - #succ!, #next!:: Returns +self+ modified to become its own successor.
12190 * - #replace:: Returns +self+ with its entire content replaced by a given string.
12191 * - #reverse!:: Returns +self+ with its characters in reverse order.
12192 * - #setbyte:: Sets the byte at a given integer offset to a given value; returns the argument.
12193 * - #tr!:: Replaces specified characters in +self+ with specified replacement characters;
12194 * returns +self+ if any changes, +nil+ otherwise.
12195 * - #tr_s!:: Replaces specified characters in +self+ with specified replacement characters,
12196 * removing duplicates from the substrings that were modified;
12197 * returns +self+ if any changes, +nil+ otherwise.
12201 * - #capitalize!:: Upcases the initial character and downcases all others;
12202 * returns +self+ if any changes, +nil+ otherwise.
12203 * - #downcase!:: Downcases all characters; returns +self+ if any changes, +nil+ otherwise.
12204 * - #upcase!:: Upcases all characters; returns +self+ if any changes, +nil+ otherwise.
12205 * - #swapcase!:: Upcases each downcase character and downcases each upcase character;
12206 * returns +self+ if any changes, +nil+ otherwise.
12210 * - #encode!:: Returns +self+ with all characters transcoded from one given encoding into another.
12211 * - #unicode_normalize!:: Unicode-normalizes +self+; returns +self+.
12212 * - #scrub!:: Replaces each invalid byte with a given character; returns +self+.
12213 * - #force_encoding:: Changes the encoding to a given encoding; returns +self+.
12217 * - #clear:: Removes all content, so that +self+ is empty; returns +self+.
12218 * - #slice!, #[]=:: Removes a substring determined by a given index, start/length, range, regexp, or substring.
12219 * - #squeeze!:: Removes contiguous duplicate characters; returns +self+.
12220 * - #delete!:: Removes characters as determined by the intersection of substring arguments.
12221 * - #lstrip!:: Removes leading whitespace; returns +self+ if any changes, +nil+ otherwise.
12222 * - #rstrip!:: Removes trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12223 * - #strip!:: Removes leading and trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12224 * - #chomp!:: Removes trailing record separator, if found; returns +self+ if any changes, +nil+ otherwise.
12225 * - #chop!:: Removes trailing whitespace if found, otherwise removes the last character;
12226 * returns +self+ if any changes, +nil+ otherwise.
12228 * === Methods for Converting to New \String
12230 * Each of these methods returns a new \String based on +self+,
12231 * often just a modified copy of +self+.
12235 * - #*:: Returns the concatenation of multiple copies of +self+.
12236 * - #+:: Returns the concatenation of +self+ and a given other string.
12237 * - #center:: Returns a copy of +self+ centered between pad substring.
12238 * - #concat:: Returns the concatenation of +self+ with given other strings.
12239 * - #prepend:: Returns the concatenation of a given other string with +self+.
12240 * - #ljust:: Returns a copy of +self+ of a given length, right-padded with a given other string.
12241 * - #rjust:: Returns a copy of +self+ of a given length, left-padded with a given other string.
12245 * - #b:: Returns a copy of +self+ with ASCII-8BIT encoding.
12246 * - #scrub:: Returns a copy of +self+ with each invalid byte replaced with a given character.
12247 * - #unicode_normalize:: Returns a copy of +self+ with each character Unicode-normalized.
12248 * - #encode:: Returns a copy of +self+ with all characters transcoded from one given encoding into another.
12252 * - #dump:: Returns a copy of +self+ with all non-printing characters replaced by \xHH notation
12253 * and all special characters escaped.
12254 * - #undump:: Returns a copy of +self+ with all <tt>\xNN</tt> notation replaced by <tt>\uNNNN</tt> notation
12255 * and all escaped characters unescaped.
12256 * - #sub:: Returns a copy of +self+ with the first substring matching a given pattern
12257 * replaced with a given replacement string.
12258 * - #gsub:: Returns a copy of +self+ with each substring that matches a given pattern
12259 * replaced with a given replacement string.
12260 * - #succ, #next:: Returns the string that is the successor to +self+.
12261 * - #reverse:: Returns a copy of +self+ with its characters in reverse order.
12262 * - #tr:: Returns a copy of +self+ with specified characters replaced with specified replacement characters.
12263 * - #tr_s:: Returns a copy of +self+ with specified characters replaced with specified replacement characters,
12264 * removing duplicates from the substrings that were modified.
12265 * - #%:: Returns the string resulting from formatting a given object into +self+
12269 * - #capitalize:: Returns a copy of +self+ with the first character upcased
12270 * and all other characters downcased.
12271 * - #downcase:: Returns a copy of +self+ with all characters downcased.
12272 * - #upcase:: Returns a copy of +self+ with all characters upcased.
12273 * - #swapcase:: Returns a copy of +self+ with all upcase characters downcased
12274 * and all downcase characters upcased.
12278 * - #delete:: Returns a copy of +self+ with characters removed
12279 * - #delete_prefix:: Returns a copy of +self+ with a given prefix removed.
12280 * - #delete_suffix:: Returns a copy of +self+ with a given suffix removed.
12281 * - #lstrip:: Returns a copy of +self+ with leading whitespace removed.
12282 * - #rstrip:: Returns a copy of +self+ with trailing whitespace removed.
12283 * - #strip:: Returns a copy of +self+ with leading and trailing whitespace removed.
12284 * - #chomp:: Returns a copy of +self+ with a trailing record separator removed, if found.
12285 * - #chop:: Returns a copy of +self+ with trailing whitespace or the last character removed.
12286 * - #squeeze:: Returns a copy of +self+ with contiguous duplicate characters removed.
12287 * - #[], #slice:: Returns a substring determined by a given index, start/length, or range, or string.
12288 * - #byteslice:: Returns a substring determined by a given index, start/length, or range.
12289 * - #chr:: Returns the first character.
12293 * - #to_s, #to_str:: If +self+ is a subclass of \String, returns +self+ copied into a \String;
12294 * otherwise, returns +self+.
12296 * === Methods for Converting to Non-\String
12298 * Each of these methods converts the contents of +self+ to a non-\String.
12300 * <em>Characters, Bytes, and Clusters</em>
12302 * - #bytes:: Returns an array of the bytes in +self+.
12303 * - #chars:: Returns an array of the characters in +self+.
12304 * - #codepoints:: Returns an array of the integer ordinals in +self+.
12305 * - #getbyte:: Returns an integer byte as determined by a given index.
12306 * - #grapheme_clusters:: Returns an array of the grapheme clusters in +self+.
12310 * - #lines:: Returns an array of the lines in +self+, as determined by a given record separator.
12311 * - #partition:: Returns a 3-element array determined by the first substring that matches
12312 * a given substring or regexp.
12313 * - #rpartition:: Returns a 3-element array determined by the last substring that matches
12314 * a given substring or regexp.
12315 * - #split:: Returns an array of substrings determined by a given delimiter -- regexp or string --
12316 * or, if a block given, passes those substrings to the block.
12320 * - #scan:: Returns an array of substrings matching a given regexp or string, or,
12321 * if a block given, passes each matching substring to the block.
12322 * - #unpack:: Returns an array of substrings extracted from +self+ according to a given format.
12323 * - #unpack1:: Returns the first substring extracted from +self+ according to a given format.
12327 * - #hex:: Returns the integer value of the leading characters, interpreted as hexadecimal digits.
12328 * - #oct:: Returns the integer value of the leading characters, interpreted as octal digits.
12329 * - #ord:: Returns the integer ordinal of the first character in +self+.
12330 * - #to_i:: Returns the integer value of leading characters, interpreted as an integer.
12331 * - #to_f:: Returns the floating-point value of leading characters, interpreted as a floating-point number.
12333 * <em>Strings and Symbols</em>
12335 * - #inspect:: Returns a copy of +self+, enclosed in double-quotes, with special characters escaped.
12336 * - #to_sym, #intern:: Returns the symbol corresponding to +self+.
12338 * === Methods for Iterating
12340 * - #each_byte:: Calls the given block with each successive byte in +self+.
12341 * - #each_char:: Calls the given block with each successive character in +self+.
12342 * - #each_codepoint:: Calls the given block with each successive integer codepoint in +self+.
12343 * - #each_grapheme_cluster:: Calls the given block with each successive grapheme cluster in +self+.
12344 * - #each_line:: Calls the given block with each successive line in +self+,
12345 * as determined by a given record separator.
12346 * - #upto:: Calls the given block with each string value returned by successive calls to #succ.
12352 rb_cString
= rb_define_class("String", rb_cObject
);
12353 assert(rb_vm_fstring_table());
12354 st_foreach(rb_vm_fstring_table(), fstring_set_class_i
, rb_cString
);
12355 rb_include_module(rb_cString
, rb_mComparable
);
12356 rb_define_alloc_func(rb_cString
, empty_str_alloc
);
12357 rb_define_singleton_method(rb_cString
, "try_convert", rb_str_s_try_convert
, 1);
12358 rb_define_method(rb_cString
, "initialize", rb_str_init
, -1);
12359 rb_define_method(rb_cString
, "initialize_copy", rb_str_replace
, 1);
12360 rb_define_method(rb_cString
, "<=>", rb_str_cmp_m
, 1);
12361 rb_define_method(rb_cString
, "==", rb_str_equal
, 1);
12362 rb_define_method(rb_cString
, "===", rb_str_equal
, 1);
12363 rb_define_method(rb_cString
, "eql?", rb_str_eql
, 1);
12364 rb_define_method(rb_cString
, "hash", rb_str_hash_m
, 0);
12365 rb_define_method(rb_cString
, "casecmp", rb_str_casecmp
, 1);
12366 rb_define_method(rb_cString
, "casecmp?", rb_str_casecmp_p
, 1);
12367 rb_define_method(rb_cString
, "+", rb_str_plus
, 1);
12368 rb_define_method(rb_cString
, "*", rb_str_times
, 1);
12369 rb_define_method(rb_cString
, "%", rb_str_format_m
, 1);
12370 rb_define_method(rb_cString
, "[]", rb_str_aref_m
, -1);
12371 rb_define_method(rb_cString
, "[]=", rb_str_aset_m
, -1);
12372 rb_define_method(rb_cString
, "insert", rb_str_insert
, 2);
12373 rb_define_method(rb_cString
, "length", rb_str_length
, 0);
12374 rb_define_method(rb_cString
, "size", rb_str_length
, 0);
12375 rb_define_method(rb_cString
, "bytesize", rb_str_bytesize
, 0);
12376 rb_define_method(rb_cString
, "empty?", rb_str_empty
, 0);
12377 rb_define_method(rb_cString
, "=~", rb_str_match
, 1);
12378 rb_define_method(rb_cString
, "match", rb_str_match_m
, -1);
12379 rb_define_method(rb_cString
, "match?", rb_str_match_m_p
, -1);
12380 rb_define_method(rb_cString
, "succ", rb_str_succ
, 0);
12381 rb_define_method(rb_cString
, "succ!", rb_str_succ_bang
, 0);
12382 rb_define_method(rb_cString
, "next", rb_str_succ
, 0);
12383 rb_define_method(rb_cString
, "next!", rb_str_succ_bang
, 0);
12384 rb_define_method(rb_cString
, "upto", rb_str_upto
, -1);
12385 rb_define_method(rb_cString
, "index", rb_str_index_m
, -1);
12386 rb_define_method(rb_cString
, "rindex", rb_str_rindex_m
, -1);
12387 rb_define_method(rb_cString
, "replace", rb_str_replace
, 1);
12388 rb_define_method(rb_cString
, "clear", rb_str_clear
, 0);
12389 rb_define_method(rb_cString
, "chr", rb_str_chr
, 0);
12390 rb_define_method(rb_cString
, "getbyte", rb_str_getbyte
, 1);
12391 rb_define_method(rb_cString
, "setbyte", rb_str_setbyte
, 2);
12392 rb_define_method(rb_cString
, "byteslice", rb_str_byteslice
, -1);
12393 rb_define_method(rb_cString
, "scrub", str_scrub
, -1);
12394 rb_define_method(rb_cString
, "scrub!", str_scrub_bang
, -1);
12395 rb_define_method(rb_cString
, "freeze", rb_str_freeze
, 0);
12396 rb_define_method(rb_cString
, "+@", str_uplus
, 0);
12397 rb_define_method(rb_cString
, "-@", str_uminus
, 0);
12399 rb_define_method(rb_cString
, "to_i", rb_str_to_i
, -1);
12400 rb_define_method(rb_cString
, "to_f", rb_str_to_f
, 0);
12401 rb_define_method(rb_cString
, "to_s", rb_str_to_s
, 0);
12402 rb_define_method(rb_cString
, "to_str", rb_str_to_s
, 0);
12403 rb_define_method(rb_cString
, "inspect", rb_str_inspect
, 0);
12404 rb_define_method(rb_cString
, "dump", rb_str_dump
, 0);
12405 rb_define_method(rb_cString
, "undump", str_undump
, 0);
12407 sym_ascii
= ID2SYM(rb_intern_const("ascii"));
12408 sym_turkic
= ID2SYM(rb_intern_const("turkic"));
12409 sym_lithuanian
= ID2SYM(rb_intern_const("lithuanian"));
12410 sym_fold
= ID2SYM(rb_intern_const("fold"));
12412 rb_define_method(rb_cString
, "upcase", rb_str_upcase
, -1);
12413 rb_define_method(rb_cString
, "downcase", rb_str_downcase
, -1);
12414 rb_define_method(rb_cString
, "capitalize", rb_str_capitalize
, -1);
12415 rb_define_method(rb_cString
, "swapcase", rb_str_swapcase
, -1);
12417 rb_define_method(rb_cString
, "upcase!", rb_str_upcase_bang
, -1);
12418 rb_define_method(rb_cString
, "downcase!", rb_str_downcase_bang
, -1);
12419 rb_define_method(rb_cString
, "capitalize!", rb_str_capitalize_bang
, -1);
12420 rb_define_method(rb_cString
, "swapcase!", rb_str_swapcase_bang
, -1);
12422 rb_define_method(rb_cString
, "hex", rb_str_hex
, 0);
12423 rb_define_method(rb_cString
, "oct", rb_str_oct
, 0);
12424 rb_define_method(rb_cString
, "split", rb_str_split_m
, -1);
12425 rb_define_method(rb_cString
, "lines", rb_str_lines
, -1);
12426 rb_define_method(rb_cString
, "bytes", rb_str_bytes
, 0);
12427 rb_define_method(rb_cString
, "chars", rb_str_chars
, 0);
12428 rb_define_method(rb_cString
, "codepoints", rb_str_codepoints
, 0);
12429 rb_define_method(rb_cString
, "grapheme_clusters", rb_str_grapheme_clusters
, 0);
12430 rb_define_method(rb_cString
, "reverse", rb_str_reverse
, 0);
12431 rb_define_method(rb_cString
, "reverse!", rb_str_reverse_bang
, 0);
12432 rb_define_method(rb_cString
, "concat", rb_str_concat_multi
, -1);
12433 rb_define_method(rb_cString
, "<<", rb_str_concat
, 1);
12434 rb_define_method(rb_cString
, "prepend", rb_str_prepend_multi
, -1);
12435 rb_define_method(rb_cString
, "crypt", rb_str_crypt
, 1);
12436 rb_define_method(rb_cString
, "intern", rb_str_intern
, 0); /* in symbol.c */
12437 rb_define_method(rb_cString
, "to_sym", rb_str_intern
, 0); /* in symbol.c */
12438 rb_define_method(rb_cString
, "ord", rb_str_ord
, 0);
12440 rb_define_method(rb_cString
, "include?", rb_str_include
, 1);
12441 rb_define_method(rb_cString
, "start_with?", rb_str_start_with
, -1);
12442 rb_define_method(rb_cString
, "end_with?", rb_str_end_with
, -1);
12444 rb_define_method(rb_cString
, "scan", rb_str_scan
, 1);
12446 rb_define_method(rb_cString
, "ljust", rb_str_ljust
, -1);
12447 rb_define_method(rb_cString
, "rjust", rb_str_rjust
, -1);
12448 rb_define_method(rb_cString
, "center", rb_str_center
, -1);
12450 rb_define_method(rb_cString
, "sub", rb_str_sub
, -1);
12451 rb_define_method(rb_cString
, "gsub", rb_str_gsub
, -1);
12452 rb_define_method(rb_cString
, "chop", rb_str_chop
, 0);
12453 rb_define_method(rb_cString
, "chomp", rb_str_chomp
, -1);
12454 rb_define_method(rb_cString
, "strip", rb_str_strip
, 0);
12455 rb_define_method(rb_cString
, "lstrip", rb_str_lstrip
, 0);
12456 rb_define_method(rb_cString
, "rstrip", rb_str_rstrip
, 0);
12457 rb_define_method(rb_cString
, "delete_prefix", rb_str_delete_prefix
, 1);
12458 rb_define_method(rb_cString
, "delete_suffix", rb_str_delete_suffix
, 1);
12460 rb_define_method(rb_cString
, "sub!", rb_str_sub_bang
, -1);
12461 rb_define_method(rb_cString
, "gsub!", rb_str_gsub_bang
, -1);
12462 rb_define_method(rb_cString
, "chop!", rb_str_chop_bang
, 0);
12463 rb_define_method(rb_cString
, "chomp!", rb_str_chomp_bang
, -1);
12464 rb_define_method(rb_cString
, "strip!", rb_str_strip_bang
, 0);
12465 rb_define_method(rb_cString
, "lstrip!", rb_str_lstrip_bang
, 0);
12466 rb_define_method(rb_cString
, "rstrip!", rb_str_rstrip_bang
, 0);
12467 rb_define_method(rb_cString
, "delete_prefix!", rb_str_delete_prefix_bang
, 1);
12468 rb_define_method(rb_cString
, "delete_suffix!", rb_str_delete_suffix_bang
, 1);
12470 rb_define_method(rb_cString
, "tr", rb_str_tr
, 2);
12471 rb_define_method(rb_cString
, "tr_s", rb_str_tr_s
, 2);
12472 rb_define_method(rb_cString
, "delete", rb_str_delete
, -1);
12473 rb_define_method(rb_cString
, "squeeze", rb_str_squeeze
, -1);
12474 rb_define_method(rb_cString
, "count", rb_str_count
, -1);
12476 rb_define_method(rb_cString
, "tr!", rb_str_tr_bang
, 2);
12477 rb_define_method(rb_cString
, "tr_s!", rb_str_tr_s_bang
, 2);
12478 rb_define_method(rb_cString
, "delete!", rb_str_delete_bang
, -1);
12479 rb_define_method(rb_cString
, "squeeze!", rb_str_squeeze_bang
, -1);
12481 rb_define_method(rb_cString
, "each_line", rb_str_each_line
, -1);
12482 rb_define_method(rb_cString
, "each_byte", rb_str_each_byte
, 0);
12483 rb_define_method(rb_cString
, "each_char", rb_str_each_char
, 0);
12484 rb_define_method(rb_cString
, "each_codepoint", rb_str_each_codepoint
, 0);
12485 rb_define_method(rb_cString
, "each_grapheme_cluster", rb_str_each_grapheme_cluster
, 0);
12487 rb_define_method(rb_cString
, "sum", rb_str_sum
, -1);
12489 rb_define_method(rb_cString
, "slice", rb_str_aref_m
, -1);
12490 rb_define_method(rb_cString
, "slice!", rb_str_slice_bang
, -1);
12492 rb_define_method(rb_cString
, "partition", rb_str_partition
, 1);
12493 rb_define_method(rb_cString
, "rpartition", rb_str_rpartition
, 1);
12495 rb_define_method(rb_cString
, "encoding", rb_obj_encoding
, 0); /* in encoding.c */
12496 rb_define_method(rb_cString
, "force_encoding", rb_str_force_encoding
, 1);
12497 rb_define_method(rb_cString
, "b", rb_str_b
, 0);
12498 rb_define_method(rb_cString
, "valid_encoding?", rb_str_valid_encoding_p
, 0);
12499 rb_define_method(rb_cString
, "ascii_only?", rb_str_is_ascii_only_p
, 0);
12501 /* define UnicodeNormalize module here so that we don't have to look it up */
12502 mUnicodeNormalize
= rb_define_module("UnicodeNormalize");
12503 id_normalize
= rb_intern_const("normalize");
12504 id_normalized_p
= rb_intern_const("normalized?");
12506 rb_define_method(rb_cString
, "unicode_normalize", rb_str_unicode_normalize
, -1);
12507 rb_define_method(rb_cString
, "unicode_normalize!", rb_str_unicode_normalize_bang
, -1);
12508 rb_define_method(rb_cString
, "unicode_normalized?", rb_str_unicode_normalized_p
, -1);
12511 rb_define_hooked_variable("$;", &rb_fs
, 0, rb_fs_setter
);
12512 rb_define_hooked_variable("$-F", &rb_fs
, 0, rb_fs_setter
);
12513 rb_gc_register_address(&rb_fs
);
12515 rb_cSymbol
= rb_define_class("Symbol", rb_cObject
);
12516 rb_include_module(rb_cSymbol
, rb_mComparable
);
12517 rb_undef_alloc_func(rb_cSymbol
);
12518 rb_undef_method(CLASS_OF(rb_cSymbol
), "new");
12519 rb_define_singleton_method(rb_cSymbol
, "all_symbols", sym_all_symbols
, 0);
12521 rb_define_method(rb_cSymbol
, "==", sym_equal
, 1);
12522 rb_define_method(rb_cSymbol
, "===", sym_equal
, 1);
12523 rb_define_method(rb_cSymbol
, "inspect", sym_inspect
, 0);
12524 rb_define_method(rb_cSymbol
, "to_s", rb_sym_to_s
, 0);
12525 rb_define_method(rb_cSymbol
, "id2name", rb_sym_to_s
, 0);
12526 rb_define_method(rb_cSymbol
, "name", rb_sym2str
, 0);
12527 rb_define_method(rb_cSymbol
, "intern", sym_to_sym
, 0);
12528 rb_define_method(rb_cSymbol
, "to_sym", sym_to_sym
, 0);
12529 rb_define_method(rb_cSymbol
, "to_proc", rb_sym_to_proc
, 0);
12530 rb_define_method(rb_cSymbol
, "succ", sym_succ
, 0);
12531 rb_define_method(rb_cSymbol
, "next", sym_succ
, 0);
12533 rb_define_method(rb_cSymbol
, "<=>", sym_cmp
, 1);
12534 rb_define_method(rb_cSymbol
, "casecmp", sym_casecmp
, 1);
12535 rb_define_method(rb_cSymbol
, "casecmp?", sym_casecmp_p
, 1);
12536 rb_define_method(rb_cSymbol
, "=~", sym_match
, 1);
12538 rb_define_method(rb_cSymbol
, "[]", sym_aref
, -1);
12539 rb_define_method(rb_cSymbol
, "slice", sym_aref
, -1);
12540 rb_define_method(rb_cSymbol
, "length", sym_length
, 0);
12541 rb_define_method(rb_cSymbol
, "size", sym_length
, 0);
12542 rb_define_method(rb_cSymbol
, "empty?", sym_empty
, 0);
12543 rb_define_method(rb_cSymbol
, "match", sym_match_m
, -1);
12544 rb_define_method(rb_cSymbol
, "match?", sym_match_m_p
, -1);
12546 rb_define_method(rb_cSymbol
, "upcase", sym_upcase
, -1);
12547 rb_define_method(rb_cSymbol
, "downcase", sym_downcase
, -1);
12548 rb_define_method(rb_cSymbol
, "capitalize", sym_capitalize
, -1);
12549 rb_define_method(rb_cSymbol
, "swapcase", sym_swapcase
, -1);
12551 rb_define_method(rb_cSymbol
, "start_with?", sym_start_with
, -1);
12552 rb_define_method(rb_cSymbol
, "end_with?", sym_end_with
, -1);
12554 rb_define_method(rb_cSymbol
, "encoding", sym_encoding
, 0);