1 /**********************************************************************
6 created at: Mon Aug 9 18:24:49 JST 1993
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
10 **********************************************************************/
12 #include "ruby/ruby.h"
14 #include "ruby/encoding.h"
15 #include "ruby/util.h"
19 VALUE rb_eRegexpError
;
21 typedef char onig_errmsg_buffer
[ONIG_MAX_ERROR_MESSAGE_LEN
];
23 #define BEG(no) regs->beg[no]
24 #define END(no) regs->end[no]
26 #if 'a' == 97 /* it's ascii */
27 static const char casetable
[] = {
28 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
29 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
30 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
31 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
32 /* ' ' '!' '"' '#' '$' '%' '&' ''' */
33 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
34 /* '(' ')' '*' '+' ',' '-' '.' '/' */
35 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
36 /* '0' '1' '2' '3' '4' '5' '6' '7' */
37 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
38 /* '8' '9' ':' ';' '<' '=' '>' '?' */
39 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
40 /* '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' */
41 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
42 /* 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' */
43 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
44 /* 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' */
45 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
46 /* 'X' 'Y' 'Z' '[' '\' ']' '^' '_' */
47 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
48 /* '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' */
49 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
50 /* 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' */
51 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
52 /* 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' */
53 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
54 /* 'x' 'y' 'z' '{' '|' '}' '~' */
55 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
56 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
57 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
58 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
59 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
60 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
61 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
62 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
63 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
64 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
65 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
66 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
67 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
68 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
69 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
70 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
71 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
74 # error >>> "You lose. You will need a translation table for your character set." <<<
78 rb_memcicmp(const void *x
, const void *y
, long len
)
80 const unsigned char *p1
= x
, *p2
= y
;
84 if ((tmp
= casetable
[(unsigned)*p1
++] - casetable
[(unsigned)*p2
++]))
93 rb_memcmp(const void *p1
, const void *p2
, long len
)
95 return memcmp(p1
, p2
, len
);
99 rb_memsearch_ss(const unsigned char *xs
, long m
, const unsigned char *ys
, long n
)
101 const unsigned char *x
= xs
, *xe
= xs
+ m
;
102 const unsigned char *y
= ys
, *ye
= ys
+ n
;
104 # if SIZEOF_VALUE == 8
105 # define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
106 # elif SIZEOF_VALUE == 4
107 # define VALUE_MAX 0xFFFFFFFFUL
110 VALUE hx
, hy
, mask
= VALUE_MAX
>> ((SIZEOF_VALUE
- m
) * CHAR_BIT
);
112 if (m
> SIZEOF_VALUE
)
113 rb_bug("!!too long pattern string!!");
115 /* Prepare hash value */
116 for (hx
= *x
++, hy
= *y
++; x
< xe
; ++x
, ++y
) {
135 rb_memsearch_qs(const unsigned char *xs
, long m
, const unsigned char *ys
, long n
)
137 const unsigned char *x
= xs
, *xe
= xs
+ m
;
138 const unsigned char *y
= ys
;
139 VALUE i
, qstable
[256];
142 for (i
= 0; i
< 256; ++i
)
145 qstable
[*x
] = xe
- x
;
147 for (; y
+ m
<= ys
+ n
; y
+= *(qstable
+ y
[m
])) {
148 if (*xs
== *y
&& memcmp(xs
, y
, m
) == 0)
154 static inline unsigned int
155 rb_memsearch_qs_utf8_hash(const unsigned char *x
)
157 register const unsigned int mix
= 8353;
158 register unsigned int h
= *x
;
183 return (unsigned char)h
;
187 rb_memsearch_qs_utf8(const unsigned char *xs
, long m
, const unsigned char *ys
, long n
)
189 const unsigned char *x
= xs
, *xe
= xs
+ m
;
190 const unsigned char *y
= ys
;
191 VALUE i
, qstable
[512];
194 for (i
= 0; i
< 512; ++i
) {
197 for (; x
< xe
; ++x
) {
198 qstable
[rb_memsearch_qs_utf8_hash(x
)] = xe
- x
;
201 for (; y
+ m
<= ys
+ n
; y
+= qstable
[rb_memsearch_qs_utf8_hash(y
+m
)]) {
202 if (*xs
== *y
&& memcmp(xs
, y
, m
) == 0)
209 rb_memsearch(const void *x0
, long m
, const void *y0
, long n
, rb_encoding
*enc
)
211 const unsigned char *x
= x0
, *y
= y0
;
213 if (m
> n
) return -1;
215 return memcmp(x0
, y0
, m
) == 0 ? 0 : -1;
221 const unsigned char *ys
= y
, *ye
= ys
+ n
;
222 for (; y
< ye
; ++y
) {
228 else if (m
<= SIZEOF_VALUE
) {
229 return rb_memsearch_ss(x0
, m
, y0
, n
);
231 else if (enc
== rb_utf8_encoding()){
232 return rb_memsearch_qs_utf8(x0
, m
, y0
, n
);
235 return rb_memsearch_qs(x0
, m
, y0
, n
);
239 #define REG_LITERAL FL_USER5
240 #define REG_ENCODING_NONE FL_USER6
242 #define KCODE_FIXED FL_USER4
244 #define ARG_REG_OPTION_MASK \
245 (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
246 #define ARG_ENCODING_FIXED 16
247 #define ARG_ENCODING_NONE 32
250 char_to_option(int c
)
256 val
= ONIG_OPTION_IGNORECASE
;
259 val
= ONIG_OPTION_EXTEND
;
262 val
= ONIG_OPTION_MULTILINE
;
272 option_to_str(char str
[4], int options
)
275 if (options
& ONIG_OPTION_MULTILINE
) *p
++ = 'm';
276 if (options
& ONIG_OPTION_IGNORECASE
) *p
++ = 'i';
277 if (options
& ONIG_OPTION_EXTEND
) *p
++ = 'x';
283 rb_char_to_option_kcode(int c
, int *option
, int *kcode
)
290 return (*option
= ARG_ENCODING_NONE
);
292 *kcode
= rb_enc_find_index("EUC-JP");
295 *kcode
= rb_enc_find_index("Windows-31J");
298 *kcode
= rb_enc_find_index("UTF-8");
302 return (*option
= char_to_option(c
));
304 *option
= ARG_ENCODING_FIXED
;
309 rb_reg_check(VALUE re
)
311 if (!RREGEXP(re
)->ptr
|| !RREGEXP_SRC(re
) || !RREGEXP_SRC_PTR(re
)) {
312 rb_raise(rb_eTypeError
, "uninitialized Regexp");
317 rb_reg_expr_str(VALUE str
, const char *s
, long len
)
319 rb_encoding
*enc
= rb_enc_get(str
);
320 const char *p
, *pend
;
324 p
= s
; pend
= p
+ len
;
326 c
= rb_enc_ascget(p
, pend
, &clen
, enc
);
328 p
+= mbclen(p
, pend
, enc
);
330 else if (c
!= '/' && rb_enc_isprint(c
, enc
)) {
339 rb_str_buf_cat(str
, s
, len
);
344 c
= rb_enc_ascget(p
, pend
, &clen
, enc
);
345 if (c
== '\\' && p
+clen
< pend
) {
346 int n
= clen
+ mbclen(p
+clen
, pend
, enc
);
347 rb_str_buf_cat(str
, p
, n
);
353 rb_str_buf_cat(str
, &c
, 1);
354 rb_str_buf_cat(str
, p
, clen
);
357 int l
= mbclen(p
, pend
, enc
);
358 rb_str_buf_cat(str
, p
, l
);
362 else if (rb_enc_isprint(c
, enc
)) {
363 rb_str_buf_cat(str
, p
, clen
);
365 else if (!rb_enc_isspace(c
, enc
)) {
368 sprintf(b
, "\\x%02X", c
);
369 rb_str_buf_cat(str
, b
, 4);
372 rb_str_buf_cat(str
, p
, clen
);
380 rb_reg_desc(const char *s
, long len
, VALUE re
)
382 VALUE str
= rb_str_buf_new2("/");
384 rb_enc_copy(str
, re
);
385 rb_reg_expr_str(str
, s
, len
);
386 rb_str_buf_cat2(str
, "/");
390 if (*option_to_str(opts
, RREGEXP(re
)->ptr
->options
))
391 rb_str_buf_cat2(str
, opts
);
402 * Returns the original string of the pattern.
404 * /ab+c/ix.source #=> "ab+c"
406 * Note that escape sequences are retained as is.
408 * /\x20\+/.source #=> "\\x20\\+"
413 rb_reg_source(VALUE re
)
418 str
= rb_enc_str_new(RREGEXP_SRC_PTR(re
),RREGEXP_SRC_LEN(re
), rb_enc_get(re
));
419 if (OBJ_TAINTED(re
)) OBJ_TAINT(str
);
425 * rxp.inspect => string
427 * Produces a nicely formatted string version of _rxp_. Perhaps surprisingly,
428 * <code>#inspect</code> actually produces a more natural version of
429 * the string than <code>#to_s</code> does.
431 * /ab+c/ix.inspect #=> "/ab+c/ix"
436 rb_reg_inspect(VALUE re
)
438 if (!RREGEXP(re
)->ptr
|| !RREGEXP_SRC(re
) || !RREGEXP_SRC_PTR(re
)) {
439 return rb_any_to_s(re
);
441 return rb_reg_desc(RREGEXP_SRC_PTR(re
), RREGEXP_SRC_LEN(re
), re
);
449 * Returns a string containing the regular expression and its options (using the
450 * <code>(?opts:source)</code> notation). This string can be fed back in to
451 * <code>Regexp::new</code> to produce a regular expression with the same semantics as
452 * the original. (However, <code>Regexp#==</code> may not return true when
453 * comparing the two, as the source of the regular expression itself may
454 * differ, as the example shows.) <code>Regexp#inspect</code> produces a
455 * generally more readable version of <i>rxp</i>.
457 * r1 = /ab+c/ix #=> /ab+c/ix
458 * s1 = r1.to_s #=> "(?ix-m:ab+c)"
459 * r2 = Regexp.new(s1) #=> /(?ix-m:ab+c)/
461 * r1.source #=> "ab+c"
462 * r2.source #=> "(?ix-m:ab+c)"
466 rb_reg_to_s(VALUE re
)
469 const int embeddable
= ONIG_OPTION_MULTILINE
|ONIG_OPTION_IGNORECASE
|ONIG_OPTION_EXTEND
;
472 VALUE str
= rb_str_buf_new2("(?");
477 rb_enc_copy(str
, re
);
478 options
= RREGEXP(re
)->ptr
->options
;
479 ptr
= (UChar
*)RREGEXP_SRC_PTR(re
);
480 len
= RREGEXP_SRC_LEN(re
);
482 if (len
>= 4 && ptr
[0] == '(' && ptr
[1] == '?') {
485 if ((len
-= 2) > 0) {
487 opt
= char_to_option((int )*ptr
);
497 if (len
> 1 && *ptr
== '-') {
501 opt
= char_to_option((int )*ptr
);
516 if (*ptr
== ':' && ptr
[len
-1] == ')') {
519 r
= onig_alloc_init(&rp
, ONIG_OPTION_DEFAULT
,
520 ONIGENC_CASE_FOLD_DEFAULT
,
526 err
= (onig_compile(rp
, ptr
, ptr
+ len
, NULL
) != 0);
531 options
= RREGEXP(re
)->ptr
->options
;
532 ptr
= (UChar
*)RREGEXP_SRC_PTR(re
);
533 len
= RREGEXP_SRC_LEN(re
);
537 if (*option_to_str(optbuf
, options
)) rb_str_buf_cat2(str
, optbuf
);
539 if ((options
& embeddable
) != embeddable
) {
541 option_to_str(optbuf
+ 1, ~options
);
542 rb_str_buf_cat2(str
, optbuf
);
545 rb_str_buf_cat2(str
, ":");
546 rb_reg_expr_str(str
, (char*)ptr
, len
);
547 rb_str_buf_cat2(str
, ")");
548 rb_enc_copy(str
, re
);
555 rb_reg_raise(const char *s
, long len
, const char *err
, VALUE re
)
557 VALUE desc
= rb_reg_desc(s
, len
, re
);
559 rb_raise(rb_eRegexpError
, "%s: %s", err
, RSTRING_PTR(desc
));
563 rb_enc_reg_error_desc(const char *s
, long len
, rb_encoding
*enc
, int options
, const char *err
)
566 VALUE desc
= rb_str_buf_new2(err
);
568 rb_enc_associate(desc
, enc
);
569 rb_str_buf_cat2(desc
, ": /");
570 rb_reg_expr_str(desc
, s
, len
);
572 option_to_str(opts
+ 1, options
);
573 rb_str_buf_cat2(desc
, opts
);
574 return rb_exc_new3(rb_eRegexpError
, desc
);
578 rb_enc_reg_raise(const char *s
, long len
, rb_encoding
*enc
, int options
, const char *err
)
580 rb_exc_raise(rb_enc_reg_error_desc(s
, len
, enc
, options
, err
));
584 rb_reg_error_desc(VALUE str
, int options
, const char *err
)
586 return rb_enc_reg_error_desc(RSTRING_PTR(str
), RSTRING_LEN(str
),
587 rb_enc_get(str
), options
, err
);
591 rb_reg_raise_str(VALUE str
, int options
, const char *err
)
593 rb_exc_raise(rb_reg_error_desc(str
, options
, err
));
599 * rxp.casefold? => true or false
601 * Returns the value of the case-insensitive flag.
603 * /a/.casefold? #=> false
604 * /a/i.casefold? #=> true
605 * /(?i:a)/.casefold? #=> false
609 rb_reg_casefold_p(VALUE re
)
612 if (RREGEXP(re
)->ptr
->options
& ONIG_OPTION_IGNORECASE
) return Qtrue
;
619 * rxp.options => fixnum
621 * Returns the set of bits corresponding to the options used when creating this
622 * Regexp (see <code>Regexp::new</code> for details). Note that additional bits
623 * may be set in the returned options: these are used internally by the regular
624 * expression code. These extra bits are ignored if the options are passed to
625 * <code>Regexp::new</code>.
627 * Regexp::IGNORECASE #=> 1
628 * Regexp::EXTENDED #=> 2
629 * Regexp::MULTILINE #=> 4
631 * /cat/.options #=> 0
632 * /cat/ix.options #=> 3
633 * Regexp.new('cat', true).options #=> 1
634 * /\xa1\xa2/e.options #=> 16
637 * Regexp.new(r.source, r.options) #=> /cat/ix
641 rb_reg_options_m(VALUE re
)
643 int options
= rb_reg_options(re
);
644 return INT2NUM(options
);
648 reg_names_iter(const OnigUChar
*name
, const OnigUChar
*name_end
,
649 int back_num
, int *back_refs
, OnigRegex regex
, void *arg
)
651 VALUE ary
= (VALUE
)arg
;
652 rb_ary_push(ary
, rb_str_new((const char *)name
, name_end
-name
));
658 * rxp.names => [name1, name2, ...]
660 * Returns a list of names of captures as an array of strings.
662 * /(?<foo>.)(?<bar>.)(?<baz>.)/.names
663 * #=> ["foo", "bar", "baz"]
665 * /(?<foo>.)(?<foo>.)/.names
673 rb_reg_names(VALUE re
)
675 VALUE ary
= rb_ary_new();
677 onig_foreach_name(RREGEXP(re
)->ptr
, reg_names_iter
, (void*)ary
);
682 reg_named_captures_iter(const OnigUChar
*name
, const OnigUChar
*name_end
,
683 int back_num
, int *back_refs
, OnigRegex regex
, void *arg
)
685 VALUE hash
= (VALUE
)arg
;
686 VALUE ary
= rb_ary_new2(back_num
);
689 for(i
= 0; i
< back_num
; i
++)
690 rb_ary_store(ary
, i
, INT2NUM(back_refs
[i
]));
692 rb_hash_aset(hash
, rb_str_new((const char*)name
, name_end
-name
),ary
);
699 * rxp.named_captures => hash
701 * Returns a hash representing information about named captures of <i>rxp</i>.
703 * Each key of the hash is the name of a named capture.
704 * Each value of the hash is an array which is a list of indexes of the corresponding
707 * /(?<foo>.)(?<bar>.)/.named_captures
708 * #=> {"foo"=>[1], "bar"=>[2]}
710 * /(?<foo>.)(?<foo>.)/.named_captures
711 * #=> {"foo"=>[1, 2]}
713 * If there are no named captures, an empty hash is returned.
715 * /(.)(.)/.named_captures
720 rb_reg_named_captures(VALUE re
)
722 VALUE hash
= rb_hash_new();
724 onig_foreach_name(RREGEXP(re
)->ptr
, reg_named_captures_iter
, (void*)hash
);
729 make_regexp(const char *s
, long len
, rb_encoding
*enc
, int flags
, onig_errmsg_buffer err
)
735 /* Handle escaped characters first. */
737 /* Build a copy of the string (in dest) with the
738 escaped characters translated, and generate the regex
742 r
= onig_alloc_init(&rp
, flags
, ONIGENC_CASE_FOLD_DEFAULT
,
743 enc
, OnigDefaultSyntax
);
745 onig_error_code_to_str((UChar
*)err
, r
);
749 r
= onig_compile(rp
, (UChar
*)s
, (UChar
*)(s
+ len
), &einfo
);
753 (void )onig_error_code_to_str((UChar
*)err
, r
, &einfo
);
761 * Document-class: MatchData
763 * <code>MatchData</code> is the type of the special variable <code>$~</code>,
764 * and is the type of the object returned by <code>Regexp#match</code> and
765 * <code>Regexp.last_match</code>. It encapsulates all the results of a pattern
766 * match, results normally accessed through the special variables
767 * <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>,
768 * <code>$2</code>, and so on.
775 match_alloc(VALUE klass
)
777 NEWOBJ(match
, struct RMatch
);
778 OBJSETUP(match
, klass
, T_MATCH
);
783 match
->rmatch
= ALLOC(struct rmatch
);
784 MEMZERO(match
->rmatch
, struct rmatch
, 1);
795 pair_byte_cmp(const void *pair1
, const void *pair2
)
797 return ((pair_t
*)pair1
)->byte_pos
- ((pair_t
*)pair2
)->byte_pos
;
801 update_char_offset(VALUE match
)
803 struct rmatch
*rm
= RMATCH(match
)->rmatch
;
804 struct re_registers
*regs
;
811 if (rm
->char_offset_updated
)
815 num_regs
= rm
->regs
.num_regs
;
817 if (rm
->char_offset_num_allocated
< num_regs
) {
818 REALLOC_N(rm
->char_offset
, struct rmatch_offset
, num_regs
);
819 rm
->char_offset_num_allocated
= num_regs
;
822 enc
= rb_enc_get(RMATCH(match
)->str
);
823 if (rb_enc_mbmaxlen(enc
) == 1) {
824 for (i
= 0; i
< num_regs
; i
++) {
825 rm
->char_offset
[i
].beg
= BEG(i
);
826 rm
->char_offset
[i
].end
= END(i
);
828 rm
->char_offset_updated
= 1;
832 pairs
= ALLOCA_N(pair_t
, num_regs
*2);
834 for (i
= 0; i
< num_regs
; i
++) {
837 pairs
[num_pos
++].byte_pos
= BEG(i
);
838 pairs
[num_pos
++].byte_pos
= END(i
);
840 qsort(pairs
, num_pos
, sizeof(pair_t
), pair_byte_cmp
);
842 s
= p
= RSTRING_PTR(RMATCH(match
)->str
);
843 e
= s
+ RSTRING_LEN(RMATCH(match
)->str
);
845 for (i
= 0; i
< num_pos
; i
++) {
846 q
= s
+ pairs
[i
].byte_pos
;
847 c
+= rb_enc_strlen(p
, q
, enc
);
848 pairs
[i
].char_pos
= c
;
852 for (i
= 0; i
< num_regs
; i
++) {
855 rm
->char_offset
[i
].beg
= -1;
856 rm
->char_offset
[i
].end
= -1;
860 key
.byte_pos
= BEG(i
);
861 found
= bsearch(&key
, pairs
, num_pos
, sizeof(pair_t
), pair_byte_cmp
);
862 rm
->char_offset
[i
].beg
= found
->char_pos
;
864 key
.byte_pos
= END(i
);
865 found
= bsearch(&key
, pairs
, num_pos
, sizeof(pair_t
), pair_byte_cmp
);
866 rm
->char_offset
[i
].end
= found
->char_pos
;
869 rm
->char_offset_updated
= 1;
873 match_check(VALUE match
)
875 if (!RMATCH(match
)->regexp
) {
876 rb_raise(rb_eTypeError
, "uninitialized Match");
882 match_init_copy(VALUE obj
, VALUE orig
)
886 if (obj
== orig
) return obj
;
888 if (!rb_obj_is_instance_of(orig
, rb_obj_class(obj
))) {
889 rb_raise(rb_eTypeError
, "wrong argument class");
891 RMATCH(obj
)->str
= RMATCH(orig
)->str
;
892 RMATCH(obj
)->regexp
= RMATCH(orig
)->regexp
;
894 rm
= RMATCH(obj
)->rmatch
;
895 onig_region_copy(&rm
->regs
, RMATCH_REGS(orig
));
897 if (!RMATCH(orig
)->rmatch
->char_offset_updated
) {
898 rm
->char_offset_updated
= 0;
901 if (rm
->char_offset_num_allocated
< rm
->regs
.num_regs
) {
902 REALLOC_N(rm
->char_offset
, struct rmatch_offset
, rm
->regs
.num_regs
);
903 rm
->char_offset_num_allocated
= rm
->regs
.num_regs
;
905 MEMCPY(rm
->char_offset
, RMATCH(orig
)->rmatch
->char_offset
,
906 struct rmatch_offset
, rm
->regs
.num_regs
);
907 rm
->char_offset_updated
= 1;
916 * mtch.regexp => regexp
918 * Returns the regexp.
920 * m = /a.*b/.match("abc")
921 * m.regexp #=> /a.*b/
925 match_regexp(VALUE match
)
928 return RMATCH(match
)->regexp
;
933 * mtch.names => [name1, name2, ...]
935 * Returns a list of names of captures as an array of strings.
936 * It is the same as mtch.regexp.names.
938 * /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
939 * #=> ["foo", "bar", "baz"]
941 * m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
942 * m.names #=> ["x", "y"]
946 match_names(VALUE match
)
949 return rb_reg_names(RMATCH(match
)->regexp
);
954 * mtch.length => integer
955 * mtch.size => integer
957 * Returns the number of elements in the match array.
959 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
965 match_size(VALUE match
)
968 return INT2FIX(RMATCH_REGS(match
)->num_regs
);
972 match_backref_number(VALUE match
, VALUE backref
)
977 struct re_registers
*regs
= RMATCH_REGS(match
);
978 VALUE regexp
= RMATCH(match
)->regexp
;
981 switch(TYPE(backref
)) {
983 return NUM2INT(backref
);
986 name
= rb_id2name(SYM2ID(backref
));
990 name
= StringValueCStr(backref
);
994 num
= onig_name_to_backref_number(RREGEXP(regexp
)->ptr
,
995 (const unsigned char*)name
,
996 (const unsigned char*)name
+ strlen(name
),
1000 rb_raise(rb_eIndexError
, "undefined group name reference: %s", name
);
1009 * mtch.offset(n) => array
1011 * Returns a two-element array containing the beginning and ending offsets of
1012 * the <em>n</em>th match.
1013 * <em>n</em> can be a string or symbol to reference a named capture.
1015 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1016 * m.offset(0) #=> [1, 7]
1017 * m.offset(4) #=> [6, 7]
1019 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1020 * p m.offset(:foo) #=> [0, 1]
1021 * p m.offset(:bar) #=> [2, 3]
1026 match_offset(VALUE match
, VALUE n
)
1028 int i
= match_backref_number(match
, n
);
1029 struct re_registers
*regs
= RMATCH_REGS(match
);
1032 if (i
< 0 || regs
->num_regs
<= i
)
1033 rb_raise(rb_eIndexError
, "index %d out of matches", i
);
1036 return rb_assoc_new(Qnil
, Qnil
);
1038 update_char_offset(match
);
1039 return rb_assoc_new(INT2FIX(RMATCH(match
)->rmatch
->char_offset
[i
].beg
),
1040 INT2FIX(RMATCH(match
)->rmatch
->char_offset
[i
].end
));
1046 * mtch.begin(n) => integer
1048 * Returns the offset of the start of the <em>n</em>th element of the match
1049 * array in the string.
1050 * <em>n</em> can be a string or symbol to reference a named capture.
1052 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1056 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1057 * p m.begin(:foo) #=> 0
1058 * p m.begin(:bar) #=> 2
1062 match_begin(VALUE match
, VALUE n
)
1064 int i
= match_backref_number(match
, n
);
1065 struct re_registers
*regs
= RMATCH_REGS(match
);
1068 if (i
< 0 || regs
->num_regs
<= i
)
1069 rb_raise(rb_eIndexError
, "index %d out of matches", i
);
1074 update_char_offset(match
);
1075 return INT2FIX(RMATCH(match
)->rmatch
->char_offset
[i
].beg
);
1081 * mtch.end(n) => integer
1083 * Returns the offset of the character immediately following the end of the
1084 * <em>n</em>th element of the match array in the string.
1085 * <em>n</em> can be a string or symbol to reference a named capture.
1087 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1091 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1092 * p m.end(:foo) #=> 1
1093 * p m.end(:bar) #=> 3
1097 match_end(VALUE match
, VALUE n
)
1099 int i
= match_backref_number(match
, n
);
1100 struct re_registers
*regs
= RMATCH_REGS(match
);
1103 if (i
< 0 || regs
->num_regs
<= i
)
1104 rb_raise(rb_eIndexError
, "index %d out of matches", i
);
1109 update_char_offset(match
);
1110 return INT2FIX(RMATCH(match
)->rmatch
->char_offset
[i
].end
);
1113 #define MATCH_BUSY FL_USER2
1116 rb_match_busy(VALUE match
)
1118 FL_SET(match
, MATCH_BUSY
);
1123 * rxp.fixed_encoding? => true or false
1125 * Returns false if rxp is applicable to
1126 * a string with any ASCII compatible encoding.
1127 * Returns true otherwise.
1130 * r.fixed_encoding? #=> false
1131 * r =~ "\u{6666} a" #=> 2
1132 * r =~ "\xa1\xa2 a".force_encoding("euc-jp") #=> 2
1133 * r =~ "abc".force_encoding("euc-jp") #=> 0
1136 * r.fixed_encoding? #=> true
1137 * r.encoding #=> #<Encoding:UTF-8>
1138 * r =~ "\u{6666} a" #=> 2
1139 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError
1140 * r =~ "abc".force_encoding("euc-jp") #=> 0
1143 * r.fixed_encoding? #=> true
1144 * r.encoding #=> #<Encoding:UTF-8>
1145 * r =~ "\u{6666} a" #=> 0
1146 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError
1147 * r =~ "abc".force_encoding("euc-jp") #=> nil
1151 rb_reg_fixed_encoding_p(VALUE re
)
1153 if (FL_TEST(re
, KCODE_FIXED
))
1160 rb_reg_preprocess(const char *p
, const char *end
, rb_encoding
*enc
,
1161 rb_encoding
**fixed_enc
, onig_errmsg_buffer err
);
1165 reg_enc_error(VALUE re
, VALUE str
)
1167 rb_raise(rb_eArgError
,
1168 "incompatible encoding regexp match (%s regexp with %s string)",
1169 rb_enc_name(RREGEXP(re
)->ptr
->enc
),
1170 rb_enc_name(rb_enc_get(str
)));
1174 rb_reg_prepare_enc(VALUE re
, VALUE str
, int warn
)
1176 rb_encoding
*enc
= 0;
1178 if (rb_enc_str_coderange(str
) == ENC_CODERANGE_BROKEN
) {
1179 rb_raise(rb_eArgError
,
1181 rb_enc_name(rb_enc_get(str
)));
1185 enc
= rb_enc_get(str
);
1186 if (!rb_enc_str_asciicompat_p(str
)) {
1187 if (RREGEXP(re
)->ptr
->enc
!= enc
) {
1188 reg_enc_error(re
, str
);
1191 else if (rb_reg_fixed_encoding_p(re
)) {
1192 if (RREGEXP(re
)->ptr
->enc
!= enc
&&
1193 (!rb_enc_asciicompat(RREGEXP(re
)->ptr
->enc
) ||
1194 rb_enc_str_coderange(str
) != ENC_CODERANGE_7BIT
)) {
1195 reg_enc_error(re
, str
);
1197 enc
= RREGEXP(re
)->ptr
->enc
;
1199 if (warn
&& (RBASIC(re
)->flags
& REG_ENCODING_NONE
) &&
1200 enc
!= rb_ascii8bit_encoding() &&
1201 rb_enc_str_coderange(str
) != ENC_CODERANGE_7BIT
) {
1202 rb_warn("regexp match /.../n against to %s string",
1209 rb_reg_prepare_re(VALUE re
, VALUE str
)
1211 regex_t
*reg
= RREGEXP(re
)->ptr
;
1212 onig_errmsg_buffer err
= "";
1214 OnigErrorInfo einfo
;
1215 const char *pattern
;
1217 rb_encoding
*fixed_enc
= 0;
1218 rb_encoding
*enc
= rb_reg_prepare_enc(re
, str
, 1);
1220 if (reg
->enc
== enc
) return reg
;
1223 reg
= RREGEXP(re
)->ptr
;
1224 pattern
= RREGEXP_SRC_PTR(re
);
1226 unescaped
= rb_reg_preprocess(
1227 pattern
, pattern
+ RREGEXP_SRC_LEN(re
), enc
,
1230 if (unescaped
== Qnil
) {
1231 rb_raise(rb_eArgError
, "regexp preprocess failed: %s", err
);
1234 r
= onig_new(®
, (UChar
* )RSTRING_PTR(unescaped
),
1235 (UChar
* )(RSTRING_PTR(unescaped
) + RSTRING_LEN(unescaped
)),
1237 OnigDefaultSyntax
, &einfo
);
1239 onig_error_code_to_str((UChar
*)err
, r
, &einfo
);
1240 rb_reg_raise(pattern
, RREGEXP_SRC_LEN(re
), err
, re
);
1243 RB_GC_GUARD(unescaped
);
1248 rb_reg_adjust_startpos(VALUE re
, VALUE str
, int pos
, int reverse
)
1254 enc
= rb_reg_prepare_enc(re
, str
, 0);
1260 range
= RSTRING_LEN(str
) - pos
;
1263 if (pos
> 0 && ONIGENC_MBC_MAXLEN(enc
) != 1 && pos
< RSTRING_LEN(str
)) {
1264 string
= (UChar
*)RSTRING_PTR(str
);
1267 p
= onigenc_get_right_adjust_char_head(enc
, string
, string
+ pos
);
1270 p
= ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc
, string
, string
+ pos
);
1279 rb_reg_search(VALUE re
, VALUE str
, int pos
, int reverse
)
1283 struct re_registers regi
, *regs
= ®i
;
1284 char *range
= RSTRING_PTR(str
);
1288 if (pos
> RSTRING_LEN(str
) || pos
< 0) {
1289 rb_backref_set(Qnil
);
1293 reg
= rb_reg_prepare_re(re
, str
);
1294 tmpreg
= reg
!= RREGEXP(re
)->ptr
;
1295 if (!tmpreg
) RREGEXP(re
)->usecnt
++;
1297 match
= rb_backref_get();
1298 if (!NIL_P(match
)) {
1299 if (FL_TEST(match
, MATCH_BUSY
)) {
1303 regs
= RMATCH_REGS(match
);
1307 MEMZERO(regs
, struct re_registers
, 1);
1310 range
+= RSTRING_LEN(str
);
1312 result
= onig_search(reg
,
1313 (UChar
*)(RSTRING_PTR(str
)),
1314 ((UChar
*)(RSTRING_PTR(str
)) + RSTRING_LEN(str
)),
1315 ((UChar
*)(RSTRING_PTR(str
)) + pos
),
1317 regs
, ONIG_OPTION_NONE
);
1318 if (!tmpreg
) RREGEXP(re
)->usecnt
--;
1320 if (RREGEXP(re
)->usecnt
) {
1324 onig_free(RREGEXP(re
)->ptr
);
1325 RREGEXP(re
)->ptr
= reg
;
1330 onig_region_free(regs
, 0);
1331 if (result
== ONIG_MISMATCH
) {
1332 rb_backref_set(Qnil
);
1336 onig_errmsg_buffer err
= "";
1337 onig_error_code_to_str((UChar
*)err
, result
);
1338 rb_reg_raise(RREGEXP_SRC_PTR(re
), RREGEXP_SRC_LEN(re
), err
, 0);
1343 match
= match_alloc(rb_cMatch
);
1344 onig_region_copy(RMATCH_REGS(match
), regs
);
1345 onig_region_free(regs
, 0);
1348 if (rb_safe_level() >= 3)
1351 FL_UNSET(match
, FL_TAINT
);
1354 RMATCH(match
)->str
= rb_str_new4(str
);
1355 RMATCH(match
)->regexp
= re
;
1356 RMATCH(match
)->rmatch
->char_offset_updated
= 0;
1357 rb_backref_set(match
);
1359 OBJ_INFECT(match
, re
);
1360 OBJ_INFECT(match
, str
);
1366 rb_reg_nth_defined(int nth
, VALUE match
)
1368 struct re_registers
*regs
;
1369 if (NIL_P(match
)) return Qnil
;
1371 regs
= RMATCH_REGS(match
);
1372 if (nth
>= regs
->num_regs
) {
1376 nth
+= regs
->num_regs
;
1377 if (nth
<= 0) return Qnil
;
1379 if (BEG(nth
) == -1) return Qfalse
;
1384 rb_reg_nth_match(int nth
, VALUE match
)
1387 long start
, end
, len
;
1388 struct re_registers
*regs
;
1390 if (NIL_P(match
)) return Qnil
;
1392 regs
= RMATCH_REGS(match
);
1393 if (nth
>= regs
->num_regs
) {
1397 nth
+= regs
->num_regs
;
1398 if (nth
<= 0) return Qnil
;
1401 if (start
== -1) return Qnil
;
1404 str
= rb_str_subseq(RMATCH(match
)->str
, start
, len
);
1405 OBJ_INFECT(str
, match
);
1410 rb_reg_last_match(VALUE match
)
1412 return rb_reg_nth_match(0, match
);
1418 * mtch.pre_match => str
1420 * Returns the portion of the original string before the current match.
1421 * Equivalent to the special variable <code>$`</code>.
1423 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1424 * m.pre_match #=> "T"
1428 rb_reg_match_pre(VALUE match
)
1431 struct re_registers
*regs
;
1433 if (NIL_P(match
)) return Qnil
;
1435 regs
= RMATCH_REGS(match
);
1436 if (BEG(0) == -1) return Qnil
;
1437 str
= rb_str_subseq(RMATCH(match
)->str
, 0, BEG(0));
1438 if (OBJ_TAINTED(match
)) OBJ_TAINT(str
);
1445 * mtch.post_match => str
1447 * Returns the portion of the original string after the current match.
1448 * Equivalent to the special variable <code>$'</code>.
1450 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1451 * m.post_match #=> ": The Movie"
1455 rb_reg_match_post(VALUE match
)
1459 struct re_registers
*regs
;
1461 if (NIL_P(match
)) return Qnil
;
1463 regs
= RMATCH_REGS(match
);
1464 if (BEG(0) == -1) return Qnil
;
1465 str
= RMATCH(match
)->str
;
1467 str
= rb_str_subseq(str
, pos
, RSTRING_LEN(str
) - pos
);
1468 if (OBJ_TAINTED(match
)) OBJ_TAINT(str
);
1473 rb_reg_match_last(VALUE match
)
1476 struct re_registers
*regs
;
1478 if (NIL_P(match
)) return Qnil
;
1480 regs
= RMATCH_REGS(match
);
1481 if (BEG(0) == -1) return Qnil
;
1483 for (i
=regs
->num_regs
-1; BEG(i
) == -1 && i
> 0; i
--)
1485 if (i
== 0) return Qnil
;
1486 return rb_reg_nth_match(i
, match
);
1490 last_match_getter(void)
1492 return rb_reg_last_match(rb_backref_get());
1496 prematch_getter(void)
1498 return rb_reg_match_pre(rb_backref_get());
1502 postmatch_getter(void)
1504 return rb_reg_match_post(rb_backref_get());
1508 last_paren_match_getter(void)
1510 return rb_reg_match_last(rb_backref_get());
1514 match_array(VALUE match
, int start
)
1516 struct re_registers
*regs
= RMATCH_REGS(match
);
1517 VALUE ary
= rb_ary_new2(regs
->num_regs
);
1518 VALUE target
= RMATCH(match
)->str
;
1520 int taint
= OBJ_TAINTED(match
);
1523 for (i
=start
; i
<regs
->num_regs
; i
++) {
1524 if (regs
->beg
[i
] == -1) {
1525 rb_ary_push(ary
, Qnil
);
1528 VALUE str
= rb_str_subseq(target
, regs
->beg
[i
], regs
->end
[i
]-regs
->beg
[i
]);
1529 if (taint
) OBJ_TAINT(str
);
1530 rb_ary_push(ary
, str
);
1537 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the
1538 second example to prevent the '*' followed by a '/' from ending the
1543 * mtch.to_a => anArray
1545 * Returns the array of matches.
1547 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1548 * m.to_a #=> ["HX1138", "H", "X", "113", "8"]
1550 * Because <code>to_a</code> is called when expanding
1551 * <code>*</code><em>variable</em>, there's a useful assignment
1552 * shortcut for extracting matched fields. This is slightly slower than
1553 * accessing the fields directly (as an intermediate array is
1556 * all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138."))
1564 match_to_a(VALUE match
)
1566 return match_array(match
, 0);
1572 * mtch.captures => array
1574 * Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
1576 * f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
1583 match_captures(VALUE match
)
1585 return match_array(match
, 1);
1589 name_to_backref_number(struct re_registers
*regs
, VALUE regexp
, const char* name
, const char* name_end
)
1593 num
= onig_name_to_backref_number(RREGEXP(regexp
)->ptr
,
1594 (const unsigned char* )name
, (const unsigned char* )name_end
, regs
);
1599 VALUE s
= rb_str_new(name
, (long )(name_end
- name
));
1600 rb_raise(rb_eIndexError
, "undefined group name reference: %s",
1607 * mtch[i] => str or nil
1608 * mtch[start, length] => array
1609 * mtch[range] => array
1610 * mtch[name] => str or nil
1612 * Match Reference---<code>MatchData</code> acts as an array, and may be
1613 * accessed using the normal array indexing techniques. <i>mtch</i>[0] is
1614 * equivalent to the special variable <code>$&</code>, and returns the entire
1615 * matched string. <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
1616 * of the matched backreferences (portions of the pattern between parentheses).
1618 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1619 * m #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
1621 * m[1, 2] #=> ["H", "X"]
1622 * m[1..3] #=> ["H", "X", "113"]
1623 * m[-3, 2] #=> ["X", "113"]
1625 * m = /(?<foo>a+)b/.match("ccaaab")
1626 * m #=> #<MatchData "aaab" foo:"aaa">
1627 * m["foo"] #=> "aaa"
1632 match_aref(int argc
, VALUE
*argv
, VALUE match
)
1637 rb_scan_args(argc
, argv
, "11", &idx
, &rest
);
1640 if (FIXNUM_P(idx
)) {
1641 if (FIX2INT(idx
) >= 0) {
1642 return rb_reg_nth_match(FIX2INT(idx
), match
);
1649 switch (TYPE(idx
)) {
1651 p
= rb_id2name(SYM2ID(idx
));
1652 goto name_to_backref
;
1655 p
= StringValuePtr(idx
);
1658 num
= name_to_backref_number(RMATCH_REGS(match
),
1659 RMATCH(match
)->regexp
, p
, p
+ strlen(p
));
1660 return rb_reg_nth_match(num
, match
);
1669 return rb_ary_aref(argc
, argv
, match_to_a(match
));
1673 match_entry(VALUE match
, long n
)
1675 return rb_reg_nth_match(n
, match
);
1682 * mtch.values_at([index]*) => array
1684 * Uses each <i>index</i> to access the matching values, returning an array of
1685 * the corresponding matches.
1687 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1688 * m.to_a #=> ["HX1138", "H", "X", "113", "8"]
1689 * m.values_at(0, 2, -2) #=> ["HX1138", "X", "113"]
1693 match_values_at(int argc
, VALUE
*argv
, VALUE match
)
1695 struct re_registers
*regs
= RMATCH_REGS(match
);
1697 return rb_get_values_at(match
, regs
->num_regs
, argc
, argv
, match_entry
);
1705 * Returns the entire matched string.
1707 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1708 * m.to_s #=> "HX1138"
1712 match_to_s(VALUE match
)
1714 VALUE str
= rb_reg_last_match(match
);
1717 if (NIL_P(str
)) str
= rb_str_new(0,0);
1718 if (OBJ_TAINTED(match
)) OBJ_TAINT(str
);
1719 if (OBJ_TAINTED(RMATCH(match
)->str
)) OBJ_TAINT(str
);
1726 * mtch.string => str
1728 * Returns a frozen copy of the string passed in to <code>match</code>.
1730 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1731 * m.string #=> "THX1138."
1735 match_string(VALUE match
)
1738 return RMATCH(match
)->str
; /* str is frozen */
1741 struct backref_name_tag
{
1747 match_inspect_name_iter(const OnigUChar
*name
, const OnigUChar
*name_end
,
1748 int back_num
, int *back_refs
, OnigRegex regex
, void *arg0
)
1750 struct backref_name_tag
*arg
= (struct backref_name_tag
*)arg0
;
1753 for (i
= 0; i
< back_num
; i
++) {
1754 arg
[back_refs
[i
]].name
= name
;
1755 arg
[back_refs
[i
]].len
= name_end
- name
;
1762 * mtch.inspect => str
1764 * Returns a printable version of <i>mtch</i>.
1766 * puts /.$/.match("foo").inspect
1767 * #=> #<MatchData "o">
1769 * puts /(.)(.)(.)/.match("foo").inspect
1770 * #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
1772 * puts /(.)(.)?(.)/.match("fo").inspect
1773 * #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
1775 * puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
1776 * #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
1781 match_inspect(VALUE match
)
1783 const char *cname
= rb_obj_classname(match
);
1786 struct re_registers
*regs
= RMATCH_REGS(match
);
1787 int num_regs
= regs
->num_regs
;
1788 struct backref_name_tag
*names
;
1789 VALUE regexp
= RMATCH(match
)->regexp
;
1792 return rb_sprintf("#<%s:%p>", cname
, (void*)match
);
1795 names
= ALLOCA_N(struct backref_name_tag
, num_regs
);
1796 MEMZERO(names
, struct backref_name_tag
, num_regs
);
1798 onig_foreach_name(RREGEXP(regexp
)->ptr
,
1799 match_inspect_name_iter
, names
);
1801 str
= rb_str_buf_new2("#<");
1802 rb_str_buf_cat2(str
, cname
);
1804 for (i
= 0; i
< num_regs
; i
++) {
1806 rb_str_buf_cat2(str
, " ");
1809 rb_str_buf_cat(str
, (const char *)names
[i
].name
, names
[i
].len
);
1811 rb_str_catf(str
, "%d", i
);
1813 rb_str_buf_cat2(str
, ":");
1815 v
= rb_reg_nth_match(i
, match
);
1817 rb_str_buf_cat2(str
, "nil");
1819 rb_str_buf_append(str
, rb_str_inspect(v
));
1821 rb_str_buf_cat2(str
, ">");
1829 read_escaped_byte(const char **pp
, const char *end
, onig_errmsg_buffer err
)
1831 const char *p
= *pp
;
1833 int meta_prefix
= 0, ctrl_prefix
= 0;
1838 if (p
== end
|| *p
++ != '\\') {
1839 strcpy(err
, "too short escaped multibyte character");
1845 strcpy(err
, "too short escape sequence");
1849 case '\\': code
= '\\'; break;
1850 case 'n': code
= '\n'; break;
1851 case 't': code
= '\t'; break;
1852 case 'r': code
= '\r'; break;
1853 case 'f': code
= '\f'; break;
1854 case 'v': code
= '\013'; break;
1855 case 'a': code
= '\007'; break;
1856 case 'e': code
= '\033'; break;
1859 case '0': case '1': case '2': case '3':
1860 case '4': case '5': case '6': case '7':
1862 code
= ruby_scan_oct(p
, end
< p
+3 ? end
-p
: 3, &len
);
1866 case 'x': /* \xHH */
1867 code
= ruby_scan_hex(p
, end
< p
+2 ? end
-p
: 2, &len
);
1869 strcpy(err
, "invalid hex escape");
1875 case 'M': /* \M-X, \M-\C-X, \M-\cX */
1877 strcpy(err
, "duplicate meta escape");
1881 if (p
+1 < end
&& *p
++ == '-' && (*p
& 0x80) == 0) {
1891 strcpy(err
, "too short meta escape");
1894 case 'C': /* \C-X, \C-\M-X */
1895 if (p
== end
|| *p
++ != '-') {
1896 strcpy(err
, "too short control escape");
1899 case 'c': /* \cX, \c\M-X */
1901 strcpy(err
, "duplicate control escape");
1905 if (p
< end
&& (*p
& 0x80) == 0) {
1915 strcpy(err
, "too short control escape");
1919 strcpy(err
, "unexpected escape sequence");
1922 if (code
< 0 || 0xff < code
) {
1923 strcpy(err
, "invalid escape code");
1937 unescape_escaped_nonascii(const char **pp
, const char *end
, rb_encoding
*enc
,
1938 VALUE buf
, rb_encoding
**encp
, onig_errmsg_buffer err
)
1940 const char *p
= *pp
;
1941 int chmaxlen
= rb_enc_mbmaxlen(enc
);
1942 char *chbuf
= ALLOCA_N(char, chmaxlen
);
1947 memset(chbuf
, 0, chmaxlen
);
1949 byte
= read_escaped_byte(&p
, end
, err
);
1954 chbuf
[chlen
++] = byte
;
1955 while (chlen
< chmaxlen
&&
1956 MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf
, chbuf
+chlen
, enc
))) {
1957 byte
= read_escaped_byte(&p
, end
, err
);
1961 chbuf
[chlen
++] = byte
;
1964 l
= rb_enc_precise_mbclen(chbuf
, chbuf
+chlen
, enc
);
1965 if (MBCLEN_INVALID_P(l
)) {
1966 strcpy(err
, "invalid multibyte escape");
1969 if (1 < chlen
|| (chbuf
[0] & 0x80)) {
1970 rb_str_buf_cat(buf
, chbuf
, chlen
);
1974 else if (*encp
!= enc
) {
1975 strcpy(err
, "escaped non ASCII character in UTF-8 regexp");
1981 snprintf(escbuf
, sizeof(escbuf
), "\\x%02X", chbuf
[0]&0xff);
1982 rb_str_buf_cat(buf
, escbuf
, 4);
1989 check_unicode_range(unsigned long code
, onig_errmsg_buffer err
)
1991 if ((0xd800 <= code
&& code
<= 0xdfff) || /* Surrogates */
1993 strcpy(err
, "invalid Unicode range");
2000 append_utf8(unsigned long uv
,
2001 VALUE buf
, rb_encoding
**encp
, onig_errmsg_buffer err
)
2003 if (check_unicode_range(uv
, err
) != 0)
2007 snprintf(escbuf
, sizeof(escbuf
), "\\x%02X", (int)uv
);
2008 rb_str_buf_cat(buf
, escbuf
, 4);
2013 len
= rb_uv_to_utf8(utf8buf
, uv
);
2014 rb_str_buf_cat(buf
, utf8buf
, len
);
2017 *encp
= rb_utf8_encoding();
2018 else if (*encp
!= rb_utf8_encoding()) {
2019 strcpy(err
, "UTF-8 character in non UTF-8 regexp");
2027 unescape_unicode_list(const char **pp
, const char *end
,
2028 VALUE buf
, rb_encoding
**encp
, onig_errmsg_buffer err
)
2030 const char *p
= *pp
;
2031 int has_unicode
= 0;
2035 while (p
< end
&& ISSPACE(*p
)) p
++;
2038 code
= ruby_scan_hex(p
, end
-p
, &len
);
2041 if (6 < len
) { /* max 10FFFF */
2042 strcpy(err
, "invalid Unicode range");
2046 if (append_utf8(code
, buf
, encp
, err
) != 0)
2050 while (p
< end
&& ISSPACE(*p
)) p
++;
2053 if (has_unicode
== 0) {
2054 strcpy(err
, "invalid Unicode list");
2064 unescape_unicode_bmp(const char **pp
, const char *end
,
2065 VALUE buf
, rb_encoding
**encp
, onig_errmsg_buffer err
)
2067 const char *p
= *pp
;
2072 strcpy(err
, "invalid Unicode escape");
2075 code
= ruby_scan_hex(p
, 4, &len
);
2077 strcpy(err
, "invalid Unicode escape");
2080 if (append_utf8(code
, buf
, encp
, err
) != 0)
2087 unescape_nonascii(const char *p
, const char *end
, rb_encoding
*enc
,
2088 VALUE buf
, rb_encoding
**encp
, int *has_property
,
2089 onig_errmsg_buffer err
)
2095 int chlen
= rb_enc_precise_mbclen(p
, end
, enc
);
2096 if (!MBCLEN_CHARFOUND_P(chlen
)) {
2097 strcpy(err
, "invalid multibyte character");
2100 chlen
= MBCLEN_CHARFOUND_LEN(chlen
);
2101 if (1 < chlen
|| (*p
& 0x80)) {
2102 rb_str_buf_cat(buf
, p
, chlen
);
2106 else if (*encp
!= enc
) {
2107 strcpy(err
, "non ASCII character in UTF-8 regexp");
2116 strcpy(err
, "too short escape sequence");
2120 case '1': case '2': case '3':
2121 case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
2124 if (ruby_scan_oct(p
-1, end
-(p
-1), &octlen
) <= 0177) {
2125 /* backref or 7bit octal.
2126 no need to unescape anyway.
2127 re-escaping may break backref */
2131 /* xxx: How about more than 199 subexpressions? */
2133 case '0': /* \0, \0O, \0OO */
2135 case 'x': /* \xHH */
2136 case 'c': /* \cX, \c\M-X */
2137 case 'C': /* \C-X, \C-\M-X */
2138 case 'M': /* \M-X, \M-\C-X, \M-\cX */
2140 if (unescape_escaped_nonascii(&p
, end
, enc
, buf
, encp
, err
) != 0)
2146 strcpy(err
, "too short escape sequence");
2150 /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
2152 if (unescape_unicode_list(&p
, end
, buf
, encp
, err
) != 0)
2154 if (p
== end
|| *p
++ != '}') {
2155 strcpy(err
, "invalid Unicode list");
2162 if (unescape_unicode_bmp(&p
, end
, buf
, encp
, err
) != 0)
2167 case 'p': /* \p{Hiragana} */
2173 default: /* \n, \\, \d, \9, etc. */
2177 rb_str_buf_cat(buf
, smallbuf
, 2);
2183 rb_str_buf_cat(buf
, &c
, 1);
2192 rb_reg_preprocess(const char *p
, const char *end
, rb_encoding
*enc
,
2193 rb_encoding
**fixed_enc
, onig_errmsg_buffer err
)
2196 int has_property
= 0;
2198 buf
= rb_str_buf_new(0);
2200 if (rb_enc_asciicompat(enc
))
2204 rb_enc_associate(buf
, enc
);
2207 if (unescape_nonascii(p
, end
, enc
, buf
, fixed_enc
, &has_property
, err
) != 0)
2210 if (has_property
&& !*fixed_enc
) {
2215 rb_enc_associate(buf
, *fixed_enc
);
2222 rb_reg_check_preprocess(VALUE str
)
2224 rb_encoding
*fixed_enc
= 0;
2225 onig_errmsg_buffer err
= "";
2231 p
= RSTRING_PTR(str
);
2232 end
= p
+ RSTRING_LEN(str
);
2233 enc
= rb_enc_get(str
);
2235 buf
= rb_reg_preprocess(p
, end
, enc
, &fixed_enc
, err
);
2239 return rb_reg_error_desc(str
, 0, err
);
2245 rb_reg_preprocess_dregexp(VALUE ary
)
2247 rb_encoding
*fixed_enc
= 0;
2248 rb_encoding
*regexp_enc
= 0;
2249 onig_errmsg_buffer err
= "";
2252 int argc
= RARRAY_LEN(ary
);
2253 VALUE
*argv
= RARRAY_PTR(ary
);
2256 rb_raise(rb_eArgError
, "no arguments given");
2259 for (i
= 0; i
< argc
; i
++) {
2260 VALUE str
= argv
[i
];
2263 rb_encoding
*src_enc
;
2266 p
= RSTRING_PTR(str
);
2267 end
= p
+ RSTRING_LEN(str
);
2268 src_enc
= rb_enc_get(str
);
2270 buf
= rb_reg_preprocess(p
, end
, src_enc
, &fixed_enc
, err
);
2273 rb_raise(rb_eArgError
, "%s", err
);
2275 if (fixed_enc
!= 0) {
2276 if (regexp_enc
!= 0 && regexp_enc
!= fixed_enc
) {
2277 rb_raise(rb_eArgError
, "encoding mismatch in dynamic regexp : %s and %s",
2278 rb_enc_name(regexp_enc
), rb_enc_name(fixed_enc
));
2280 regexp_enc
= fixed_enc
;
2284 result
= rb_str_new3(str
);
2286 rb_str_buf_append(result
, str
);
2289 rb_enc_associate(result
, regexp_enc
);
2296 rb_reg_initialize(VALUE obj
, const char *s
, int len
, rb_encoding
*enc
,
2297 int options
, onig_errmsg_buffer err
)
2299 struct RRegexp
*re
= RREGEXP(obj
);
2301 rb_encoding
*fixed_enc
= 0;
2302 rb_encoding
*a_enc
= rb_ascii8bit_encoding();
2304 if (!OBJ_UNTRUSTED(obj
) && rb_safe_level() >= 4)
2305 rb_raise(rb_eSecurityError
, "Insecure: can't modify regexp");
2306 rb_check_frozen(obj
);
2307 if (FL_TEST(obj
, REG_LITERAL
))
2308 rb_raise(rb_eSecurityError
, "can't modify literal regexp");
2310 rb_raise(rb_eTypeError
, "already initialized regexp");
2313 unescaped
= rb_reg_preprocess(s
, s
+len
, enc
, &fixed_enc
, err
);
2314 if (unescaped
== Qnil
)
2318 if ((fixed_enc
!= enc
&& (options
& ARG_ENCODING_FIXED
)) ||
2319 (fixed_enc
!= a_enc
&& (options
& ARG_ENCODING_NONE
))) {
2320 strcpy(err
, "incompatible character encoding");
2323 if (fixed_enc
!= a_enc
) {
2324 options
|= ARG_ENCODING_FIXED
;
2328 else if (!(options
& ARG_ENCODING_FIXED
)) {
2329 enc
= rb_usascii_encoding();
2332 rb_enc_associate((VALUE
)re
, enc
);
2333 if ((options
& ARG_ENCODING_FIXED
) || fixed_enc
) {
2334 re
->basic
.flags
|= KCODE_FIXED
;
2336 if (options
& ARG_ENCODING_NONE
) {
2337 re
->basic
.flags
|= REG_ENCODING_NONE
;
2340 re
->ptr
= make_regexp(RSTRING_PTR(unescaped
), RSTRING_LEN(unescaped
), enc
,
2341 options
& ARG_REG_OPTION_MASK
, err
);
2342 if (!re
->ptr
) return -1;
2343 re
->src
= rb_enc_str_new(s
, len
, enc
);
2344 OBJ_FREEZE(re
->src
);
2345 RB_GC_GUARD(unescaped
);
2350 rb_reg_initialize_str(VALUE obj
, VALUE str
, int options
, onig_errmsg_buffer err
)
2353 rb_encoding
*enc
= rb_enc_get(str
);
2354 if (options
& ARG_ENCODING_NONE
) {
2355 rb_encoding
*ascii8bit
= rb_ascii8bit_encoding();
2356 if (enc
!= ascii8bit
) {
2357 if (rb_enc_str_coderange(str
) != ENC_CODERANGE_7BIT
) {
2358 strcpy(err
, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2364 ret
= rb_reg_initialize(obj
, RSTRING_PTR(str
), RSTRING_LEN(str
), enc
,
2371 rb_reg_s_alloc(VALUE klass
)
2373 NEWOBJ(re
, struct RRegexp
);
2374 OBJSETUP(re
, klass
, T_REGEXP
);
2384 rb_reg_new_str(VALUE s
, int options
)
2386 VALUE re
= rb_reg_s_alloc(rb_cRegexp
);
2387 onig_errmsg_buffer err
= "";
2389 if (rb_reg_initialize_str(re
, s
, options
, err
) != 0) {
2390 rb_reg_raise_str(s
, options
, err
);
2397 rb_reg_new_ary(VALUE ary
, int opt
)
2399 return rb_reg_new_str(rb_reg_preprocess_dregexp(ary
), opt
);
2403 rb_enc_reg_new(const char *s
, long len
, rb_encoding
*enc
, int options
)
2405 VALUE re
= rb_reg_s_alloc(rb_cRegexp
);
2406 onig_errmsg_buffer err
= "";
2408 if (rb_reg_initialize(re
, s
, len
, enc
, options
, err
) != 0) {
2409 rb_enc_reg_raise(s
, len
, enc
, options
, err
);
2416 rb_reg_new(const char *s
, long len
, int options
)
2418 return rb_enc_reg_new(s
, len
, rb_ascii8bit_encoding(), options
);
2422 rb_reg_compile(VALUE str
, int options
)
2424 VALUE re
= rb_reg_s_alloc(rb_cRegexp
);
2425 onig_errmsg_buffer err
= "";
2427 if (!str
) str
= rb_str_new(0,0);
2428 if (rb_reg_initialize_str(re
, str
, options
, err
) != 0) {
2429 rb_set_errinfo(rb_reg_error_desc(str
, options
, err
));
2432 FL_SET(re
, REG_LITERAL
);
2436 static VALUE reg_cache
;
2439 rb_reg_regcomp(VALUE str
)
2441 volatile VALUE save_str
= str
;
2442 if (reg_cache
&& RREGEXP_SRC_LEN(reg_cache
) == RSTRING_LEN(str
)
2443 && ENCODING_GET(reg_cache
) == ENCODING_GET(str
)
2444 && memcmp(RREGEXP_SRC_PTR(reg_cache
), RSTRING_PTR(str
), RSTRING_LEN(str
)) == 0)
2447 return reg_cache
= rb_reg_new_str(save_str
, 0);
2452 * rxp.hash => fixnum
2454 * Produce a hash based on the text and options of this regular expression.
2458 rb_reg_hash(VALUE re
)
2464 hashval
= RREGEXP(re
)->ptr
->options
;
2465 len
= RREGEXP_SRC_LEN(re
);
2466 p
= RREGEXP_SRC_PTR(re
);
2468 hashval
= hashval
* 33 + *p
++;
2470 hashval
= hashval
+ (hashval
>>5);
2472 return INT2FIX(hashval
);
2478 * rxp == other_rxp => true or false
2479 * rxp.eql?(other_rxp) => true or false
2481 * Equality---Two regexps are equal if their patterns are identical, they have
2482 * the same character set code, and their <code>casefold?</code> values are the
2485 * /abc/ == /abc/x #=> false
2486 * /abc/ == /abc/i #=> false
2487 * /abc/ == /abc/n #=> false
2488 * /abc/u == /abc/n #=> false
2492 rb_reg_equal(VALUE re1
, VALUE re2
)
2494 if (re1
== re2
) return Qtrue
;
2495 if (TYPE(re2
) != T_REGEXP
) return Qfalse
;
2496 rb_reg_check(re1
); rb_reg_check(re2
);
2497 if (FL_TEST(re1
, KCODE_FIXED
) != FL_TEST(re2
, KCODE_FIXED
)) return Qfalse
;
2498 if (RREGEXP(re1
)->ptr
->options
!= RREGEXP(re2
)->ptr
->options
) return Qfalse
;
2499 if (RREGEXP_SRC_LEN(re1
) != RREGEXP_SRC_LEN(re2
)) return Qfalse
;
2500 if (ENCODING_GET(re1
) != ENCODING_GET(re2
)) return Qfalse
;
2501 if (memcmp(RREGEXP_SRC_PTR(re1
), RREGEXP_SRC_PTR(re2
), RREGEXP_SRC_LEN(re1
)) == 0) {
2508 reg_operand(VALUE s
, int check
)
2511 return rb_sym_to_s(s
);
2514 VALUE tmp
= rb_check_string_type(s
);
2515 if (check
&& NIL_P(tmp
)) {
2516 rb_raise(rb_eTypeError
, "can't convert %s to String",
2517 rb_obj_classname(s
));
2524 reg_match_pos(VALUE re
, VALUE
*strp
, long pos
)
2529 rb_backref_set(Qnil
);
2532 *strp
= str
= reg_operand(str
, Qtrue
);
2535 VALUE l
= rb_str_length(str
);
2541 pos
= rb_reg_adjust_startpos(re
, str
, pos
, 0);
2543 return rb_reg_search(re
, str
, pos
, 0);
2548 * rxp =~ str => integer or nil
2550 * Match---Matches <i>rxp</i> against <i>str</i>.
2552 * /at/ =~ "input data" #=> 7
2553 * /ax/ =~ "input data" #=> nil
2555 * If <code>=~</code> is used with a regexp literal with named captures,
2556 * captured strings (or nil) are assigned to local variables named by
2557 * the capture names.
2559 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = y "
2563 * If it is not matched, nil is assigned for the variables.
2565 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = "
2569 * This assignment is implemented in the Ruby parser.
2570 * So a regexp literal is required for the assignment.
2571 * The assignment does not occur if the regexp is not a literal.
2573 * re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
2575 * p lhs # undefined local variable
2576 * p rhs # undefined local variable
2578 * A regexp interpolation, <code>#{}</code>, also disables
2581 * rhs_pat = /(?<rhs>\w+)/
2582 * /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
2583 * p lhs # undefined local variable
2588 rb_reg_match(VALUE re
, VALUE str
)
2590 long pos
= reg_match_pos(re
, &str
, 0);
2591 if (pos
< 0) return Qnil
;
2592 pos
= rb_str_sublen(str
, pos
);
2593 return LONG2FIX(pos
);
2598 * rxp === str => true or false
2600 * Case Equality---Synonym for <code>Regexp#=~</code> used in case statements.
2604 * when /^[a-z]*$/; print "Lower case\n"
2605 * when /^[A-Z]*$/; print "Upper case\n"
2606 * else; print "Mixed case\n"
2609 * <em>produces:</em>
2615 rb_reg_eqq(VALUE re
, VALUE str
)
2619 str
= reg_operand(str
, Qfalse
);
2621 rb_backref_set(Qnil
);
2624 start
= rb_reg_search(re
, str
, 0, 0);
2634 * ~ rxp => integer or nil
2636 * Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
2637 * Equivalent to <code><i>rxp</i> =~ $_</code>.
2644 rb_reg_match2(VALUE re
)
2647 VALUE line
= rb_lastline_get();
2649 if (TYPE(line
) != T_STRING
) {
2650 rb_backref_set(Qnil
);
2654 start
= rb_reg_search(re
, line
, 0, 0);
2658 start
= rb_str_sublen(line
, start
);
2659 return LONG2FIX(start
);
2665 * rxp.match(str) => matchdata or nil
2666 * rxp.match(str,pos) => matchdata or nil
2668 * Returns a <code>MatchData</code> object describing the match, or
2669 * <code>nil</code> if there was no match. This is equivalent to retrieving the
2670 * value of the special variable <code>$~</code> following a normal match.
2671 * If the second parameter is present, it specifies the position in the string
2672 * to begin the search.
2674 * /(.)(.)(.)/.match("abc")[2] #=> "b"
2675 * /(.)(.)/.match("abc", 1)[2] #=> "c"
2677 * If a block is given, invoke the block with MatchData if the match succeeds, so
2678 * that you can write
2680 * pat.match(str) {|m| ...}
2684 * if m = pat.match(str)
2688 * The return value is a value from block execution in this case.
2692 rb_reg_match_m(int argc
, VALUE
*argv
, VALUE re
)
2694 VALUE result
, str
, initpos
;
2697 if (rb_scan_args(argc
, argv
, "11", &str
, &initpos
) == 2) {
2698 pos
= NUM2LONG(initpos
);
2704 pos
= reg_match_pos(re
, &str
, pos
);
2706 rb_backref_set(Qnil
);
2709 result
= rb_backref_get();
2710 rb_match_busy(result
);
2711 if (!NIL_P(result
) && rb_block_given_p()) {
2712 return rb_yield(result
);
2718 * Document-method: compile
2720 * Synonym for <code>Regexp.new</code>
2726 * Regexp.new(string [, options]) => regexp
2727 * Regexp.new(regexp) => regexp
2728 * Regexp.compile(string [, options]) => regexp
2729 * Regexp.compile(regexp) => regexp
2731 * Constructs a new regular expression from <i>pattern</i>, which can be either
2732 * a <code>String</code> or a <code>Regexp</code> (in which case that regexp's
2733 * options are propagated, and new options may not be specified -- a change as of
2734 * Ruby 1.8). If <i>options</i> is a <code>Fixnum</code>, it should be one or
2735 * more of the constants <code>Regexp::EXTENDED</code>,
2736 * <code>Regexp::IGNORECASE</code>, and <code>Regexp::MULTILINE</code>,
2737 * <em>or</em>-ed together. Otherwise, if <i>options</i> is not
2738 * <code>nil</code>, the regexp will be case insensitive.
2740 * r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/
2741 * r2 = Regexp.new('cat', true) #=> /cat/i
2742 * r3 = Regexp.new('dog', Regexp::EXTENDED) #=> /dog/x
2743 * r4 = Regexp.new(r2) #=> /cat/i
2747 rb_reg_initialize_m(int argc
, VALUE
*argv
, VALUE self
)
2749 onig_errmsg_buffer err
= "";
2756 if (argc
== 0 || argc
> 3) {
2757 rb_raise(rb_eArgError
, "wrong number of arguments");
2759 if (TYPE(argv
[0]) == T_REGEXP
) {
2763 rb_warn("flags ignored");
2766 flags
= rb_reg_options(re
);
2767 ptr
= RREGEXP_SRC_PTR(re
);
2768 len
= RREGEXP_SRC_LEN(re
);
2769 enc
= rb_enc_get(re
);
2770 if (rb_reg_initialize(self
, ptr
, len
, enc
, flags
, err
)) {
2771 str
= rb_enc_str_new(ptr
, len
, enc
);
2772 rb_reg_raise_str(str
, flags
, err
);
2777 if (FIXNUM_P(argv
[1])) flags
= FIX2INT(argv
[1]);
2778 else if (RTEST(argv
[1])) flags
= ONIG_OPTION_IGNORECASE
;
2781 if (argc
== 3 && !NIL_P(argv
[2])) {
2782 char *kcode
= StringValuePtr(argv
[2]);
2783 if (kcode
[0] == 'n' || kcode
[0] == 'N') { /* 'n' or 'N' as the first byte selects ASCII-8BIT */
2784 enc
= rb_ascii8bit_encoding();
2785 flags
|= ARG_ENCODING_FIXED
;
2788 rb_warning("encoding option is obsolete - %s", kcode
);
2792 ptr
= StringValuePtr(str
);
2794 ? rb_reg_initialize(self
, ptr
, RSTRING_LEN(str
), enc
, flags
, err
)
2795 : rb_reg_initialize_str(self
, str
, flags
, err
)) {
2796 rb_reg_raise_str(str
, flags
, err
);
2803 rb_reg_quote(VALUE str
)
2805 rb_encoding
*enc
= rb_enc_get(str
);
2809 int ascii_only
= rb_enc_str_asciionly_p(str
);
2811 s
= RSTRING_PTR(str
);
2812 send
= s
+ RSTRING_LEN(str
);
2814 c
= rb_enc_ascget(s
, send
, &clen
, enc
);
2816 s
+= mbclen(s
, send
, enc
);
2820 case '[': case ']': case '{': case '}':
2821 case '(': case ')': case '|': case '-':
2822 case '*': case '.': case '\\':
2823 case '?': case '+': case '^': case '$':
2825 case '\t': case '\f': case '\v': case '\n': case '\r':
2830 tmp
= rb_str_new3(str
);
2832 rb_enc_associate(tmp
, rb_usascii_encoding());
2837 tmp
= rb_str_new(0, RSTRING_LEN(str
)*2);
2839 rb_enc_associate(tmp
, rb_usascii_encoding());
2842 rb_enc_copy(tmp
, str
);
2844 t
= RSTRING_PTR(tmp
);
2845 /* copy up to the first metacharacter */
2846 memcpy(t
, RSTRING_PTR(str
), s
- RSTRING_PTR(str
));
2847 t
+= s
- RSTRING_PTR(str
);
2850 c
= rb_enc_ascget(s
, send
, &clen
, enc
);
2852 int n
= mbclen(s
, send
, enc
);
2860 case '[': case ']': case '{': case '}':
2861 case '(': case ')': case '|': case '-':
2862 case '*': case '.': case '\\':
2863 case '?': case '+': case '^': case '$':
2894 rb_str_resize(tmp
, t
- RSTRING_PTR(tmp
));
2895 OBJ_INFECT(tmp
, str
);
2902 * Regexp.escape(str) => string
2903 * Regexp.quote(str) => string
2905 * Escapes any characters that would have special meaning in a regular
2906 * expression. Returns a new escaped string, or self if no characters are
2907 * escaped. For any string,
2908 * <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
2910 * Regexp.escape('\*?{}.') #=> \\\*\?\{\}\.
2915 rb_reg_s_quote(VALUE c
, VALUE str
)
2917 return rb_reg_quote(reg_operand(str
, Qtrue
));
2921 rb_reg_options(VALUE re
)
2926 options
= RREGEXP(re
)->ptr
->options
& ARG_REG_OPTION_MASK
;
2927 if (RBASIC(re
)->flags
& KCODE_FIXED
) options
|= ARG_ENCODING_FIXED
;
2928 if (RBASIC(re
)->flags
& REG_ENCODING_NONE
) options
|= ARG_ENCODING_NONE
;
2933 rb_check_regexp_type(VALUE re
)
2935 return rb_check_convert_type(re
, T_REGEXP
, "Regexp", "to_regexp");
2940 * Regexp.try_convert(obj) -> re or nil
2942 * Try to convert <i>obj</i> into a Regexp, using to_regexp method.
2943 * Returns converted regexp or nil if <i>obj</i> cannot be converted
2946 * Regexp.try_convert(/re/) #=> /re/
2947 * Regexp.try_convert("re") #=> nil
2950 * Regexp.try_convert(o) #=> nil
2951 * def o.to_regexp() /foo/ end
2952 * Regexp.try_convert(o) #=> /foo/
2956 rb_reg_s_try_convert(VALUE dummy
, VALUE re
)
2958 return rb_check_regexp_type(re
);
2962 rb_reg_s_union(VALUE self
, VALUE args0
)
2964 long argc
= RARRAY_LEN(args0
);
2968 args
[0] = rb_str_new2("(?!)");
2969 return rb_class_new_instance(1, args
, rb_cRegexp
);
2971 else if (argc
== 1) {
2972 VALUE arg
= rb_ary_entry(args0
, 0);
2973 VALUE re
= rb_check_regexp_type(arg
);
2978 quoted
= rb_reg_s_quote(Qnil
, arg
);
2979 return rb_reg_new_str(quoted
, 0);
2984 VALUE source
= rb_str_buf_new(0);
2985 rb_encoding
*result_enc
;
2987 int has_asciionly
= 0;
2988 rb_encoding
*has_ascii_compat_fixed
= 0;
2989 rb_encoding
*has_ascii_incompat
= 0;
2991 for (i
= 0; i
< argc
; i
++) {
2993 VALUE e
= rb_ary_entry(args0
, i
);
2996 rb_str_buf_cat_ascii(source
, "|");
2998 v
= rb_check_regexp_type(e
);
3000 rb_encoding
*enc
= rb_enc_get(v
);
3001 if (!rb_enc_asciicompat(enc
)) {
3002 if (!has_ascii_incompat
)
3003 has_ascii_incompat
= enc
;
3004 else if (has_ascii_incompat
!= enc
)
3005 rb_raise(rb_eArgError
, "incompatible encodings: %s and %s",
3006 rb_enc_name(has_ascii_incompat
), rb_enc_name(enc
));
3008 else if (rb_reg_fixed_encoding_p(v
)) {
3009 if (!has_ascii_compat_fixed
)
3010 has_ascii_compat_fixed
= enc
;
3011 else if (has_ascii_compat_fixed
!= enc
)
3012 rb_raise(rb_eArgError
, "incompatible encodings: %s and %s",
3013 rb_enc_name(has_ascii_compat_fixed
), rb_enc_name(enc
));
3023 enc
= rb_enc_get(e
);
3024 if (!rb_enc_str_asciicompat_p(e
)) {
3025 if (!has_ascii_incompat
)
3026 has_ascii_incompat
= enc
;
3027 else if (has_ascii_incompat
!= enc
)
3028 rb_raise(rb_eArgError
, "incompatible encodings: %s and %s",
3029 rb_enc_name(has_ascii_incompat
), rb_enc_name(enc
));
3031 else if (rb_enc_str_asciionly_p(e
)) {
3035 if (!has_ascii_compat_fixed
)
3036 has_ascii_compat_fixed
= enc
;
3037 else if (has_ascii_compat_fixed
!= enc
)
3038 rb_raise(rb_eArgError
, "incompatible encodings: %s and %s",
3039 rb_enc_name(has_ascii_compat_fixed
), rb_enc_name(enc
));
3041 v
= rb_reg_s_quote(Qnil
, e
);
3043 if (has_ascii_incompat
) {
3044 if (has_asciionly
) {
3045 rb_raise(rb_eArgError
, "ASCII incompatible encoding: %s",
3046 rb_enc_name(has_ascii_incompat
));
3048 if (has_ascii_compat_fixed
) {
3049 rb_raise(rb_eArgError
, "incompatible encodings: %s and %s",
3050 rb_enc_name(has_ascii_incompat
), rb_enc_name(has_ascii_compat_fixed
));
3055 rb_enc_copy(source
, v
);
3057 rb_str_append(source
, v
);
3060 if (has_ascii_incompat
) {
3061 result_enc
= has_ascii_incompat
;
3063 else if (has_ascii_compat_fixed
) {
3064 result_enc
= has_ascii_compat_fixed
;
3067 result_enc
= rb_ascii8bit_encoding();
3070 rb_enc_associate(source
, result_enc
);
3071 return rb_class_new_instance(1, &source
, rb_cRegexp
);
3077 * Regexp.union(pat1, pat2, ...) => new_regexp
3078 * Regexp.union(pats_ary) => new_regexp
3080 * Return a <code>Regexp</code> object that is the union of the given
3081 * <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s
3082 * can be Regexp objects, in which case their options will be preserved, or
3083 * Strings. If no patterns are given, returns <code>/(?!)/</code>.
3085 * Regexp.union #=> /(?!)/
3086 * Regexp.union("penzance") #=> /penzance/
3087 * Regexp.union("a+b*c") #=> /a\+b\*c/
3088 * Regexp.union("skiing", "sledding") #=> /skiing|sledding/
3089 * Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
3090 * Regexp.union(/dogs/, /cats/i) #=> /(?-mix:dogs)|(?i-mx:cats)/
3093 rb_reg_s_union_m(VALUE self
, VALUE args
)
3096 if (RARRAY_LEN(args
) == 1 &&
3097 !NIL_P(v
= rb_check_array_type(rb_ary_entry(args
, 0)))) {
3098 return rb_reg_s_union(self
, v
);
3100 return rb_reg_s_union(self
, args
);
3105 rb_reg_init_copy(VALUE copy
, VALUE re
)
3107 onig_errmsg_buffer err
= "";
3111 if (copy
== re
) return copy
;
3112 rb_check_frozen(copy
);
3113 /* need better argument type check */
3114 if (!rb_obj_is_instance_of(re
, rb_obj_class(copy
))) {
3115 rb_raise(rb_eTypeError
, "wrong argument type");
3118 s
= RREGEXP_SRC_PTR(re
);
3119 len
= RREGEXP_SRC_LEN(re
);
3120 if (rb_reg_initialize(copy
, s
, len
, rb_enc_get(re
), rb_reg_options(re
), err
) != 0) {
3121 rb_reg_raise(s
, len
, err
, re
);
3127 rb_reg_regsub(VALUE str
, VALUE src
, struct re_registers
*regs
, VALUE regexp
)
3132 rb_encoding
*str_enc
= rb_enc_get(str
);
3133 rb_encoding
*src_enc
= rb_enc_get(src
);
3134 int acompat
= rb_enc_asciicompat(str_enc
);
3135 #define ASCGET(s,e,cl) (acompat ? (*cl=1,s[0]) : rb_enc_ascget(s, e, cl, str_enc))
3137 p
= s
= RSTRING_PTR(str
);
3138 e
= s
+ RSTRING_LEN(str
);
3141 int c
= ASCGET(s
, e
, &clen
);
3145 s
+= mbclen(s
, e
, str_enc
);
3151 if (c
!= '\\' || s
== e
) continue;
3154 val
= rb_str_buf_new(ss
-p
);
3156 rb_enc_str_buf_cat(val
, p
, ss
-p
, str_enc
);
3158 c
= ASCGET(s
, e
, &clen
);
3160 s
+= mbclen(s
, e
, str_enc
);
3161 rb_enc_str_buf_cat(val
, ss
, s
-ss
, str_enc
);
3169 case '1': case '2': case '3': case '4':
3170 case '5': case '6': case '7': case '8': case '9':
3171 if (onig_noname_group_capture_is_active(RREGEXP(regexp
)->ptr
)) {
3180 if (s
< e
&& ASCGET(s
, e
, &clen
) == '<') {
3181 char *name
, *name_end
;
3183 name_end
= name
= s
+ clen
;
3184 while (name_end
< e
) {
3185 c
= ASCGET(name_end
, e
, &clen
);
3186 if (c
== '>') break;
3187 name_end
+= c
== -1 ? mbclen(name_end
, e
, str_enc
) : clen
;
3190 no
= name_to_backref_number(regs
, regexp
, name
, name_end
);
3191 p
= s
= name_end
+ clen
;
3195 rb_raise(rb_eRuntimeError
, "invalid group name reference format");
3199 rb_enc_str_buf_cat(val
, ss
, s
-ss
, str_enc
);
3208 rb_enc_str_buf_cat(val
, RSTRING_PTR(src
), BEG(0), src_enc
);
3212 rb_enc_str_buf_cat(val
, RSTRING_PTR(src
)+END(0), RSTRING_LEN(src
)-END(0), src_enc
);
3216 no
= regs
->num_regs
-1;
3217 while (BEG(no
) == -1 && no
> 0) no
--;
3218 if (no
== 0) continue;
3222 rb_enc_str_buf_cat(val
, s
-clen
, clen
, str_enc
);
3226 rb_enc_str_buf_cat(val
, ss
, s
-ss
, str_enc
);
3231 if (no
>= regs
->num_regs
) continue;
3232 if (BEG(no
) == -1) continue;
3233 rb_enc_str_buf_cat(val
, RSTRING_PTR(src
)+BEG(no
), END(no
)-BEG(no
), src_enc
);
3237 if (!val
) return str
;
3239 rb_enc_str_buf_cat(val
, p
, e
-p
, str_enc
);
3248 rb_warn("variable $KCODE is no longer effective");
3253 kcode_setter(VALUE val
, ID id
)
3255 rb_warn("variable $KCODE is no longer effective; ignored");
3259 ignorecase_getter(void)
3261 rb_warn("variable $= is no longer effective");
3266 ignorecase_setter(VALUE val
, ID id
)
3268 rb_warn("variable $= is no longer effective; ignored");
3274 VALUE match
= rb_backref_get();
3276 if (NIL_P(match
)) return Qnil
;
3277 rb_match_busy(match
);
3282 match_setter(VALUE val
)
3285 Check_Type(val
, T_MATCH
);
3287 rb_backref_set(val
);
3292 * Regexp.last_match => matchdata
3293 * Regexp.last_match(n) => str
3295 * The first form returns the <code>MatchData</code> object generated by the
3296 * last successful pattern match. Equivalent to reading the global variable
3297 * <code>$~</code>. The second form returns the <i>n</i>th field in this
3298 * <code>MatchData</code> object.
3299 * <em>n</em> can be a string or symbol to reference a named capture.
3301 * /c(.)t/ =~ 'cat' #=> 0
3302 * Regexp.last_match #=> #<MatchData "cat" 1:"a">
3303 * Regexp.last_match(0) #=> "cat"
3304 * Regexp.last_match(1) #=> "a"
3305 * Regexp.last_match(2) #=> nil
3307 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
3308 * Regexp.last_match #=> #<MatchData "var = val" lhs:"var" rhs:"val">
3309 * Regexp.last_match(:lhs) #=> "var"
3310 * Regexp.last_match(:rhs) #=> "val"
3314 rb_reg_s_last_match(int argc
, VALUE
*argv
)
3318 if (argc
> 0 && rb_scan_args(argc
, argv
, "01", &nth
) == 1) {
3319 VALUE match
= rb_backref_get();
3321 if (NIL_P(match
)) return Qnil
;
3322 n
= match_backref_number(match
, nth
);
3323 return rb_reg_nth_match(n
, match
);
3325 return match_getter();
3329 re_warn(const char *s
)
3335 * Document-class: Regexp
3337 * A <code>Regexp</code> holds a regular expression, used to match a pattern
3338 * against strings. Regexps are created using the <code>/.../</code> and
3339 * <code>%r{...}</code> literals, and by the <code>Regexp::new</code>
/*
 * Registration of the RegexpError, Regexp and MatchData classes, the
 * match-related special variables, and the regex engine defaults.
 * NOTE(review): presumably the body of the Regexp initialisation routine --
 * the enclosing function header is not visible here; confirm.
 *
 * Arity arguments follow the Ruby C API convention: a non-negative value is
 * the fixed argument count, -1 means the handler receives argc/argv, and -2
 * means the arguments arrive packed in a Ruby array.
 */
/* RegexpError is defined as a subclass of StandardError. */
3347 rb_eRegexpError
= rb_define_class("RegexpError", rb_eStandardError
);
/*
 * Regex engine defaults: use this file's casetable for case conversion,
 * ASCII as the default encoding, and route both warning hooks to re_warn.
 */
3349 onigenc_set_default_caseconv_table((UChar
*)casetable
);
3350 onigenc_set_default_encoding(ONIG_ENCODING_ASCII
);
3351 onig_set_warn_func(re_warn
);
3352 onig_set_verb_warn_func(re_warn
);
/*
 * Match-related special variables.  Only $~ is given a setter; the others
 * pass 0 in the setter slot.
 */
3354 rb_define_virtual_variable("$~", match_getter
, match_setter
);
3355 rb_define_virtual_variable("$&", last_match_getter
, 0);
3356 rb_define_virtual_variable("$`", prematch_getter
, 0);
3357 rb_define_virtual_variable("$'", postmatch_getter
, 0);
3358 rb_define_virtual_variable("$+", last_paren_match_getter
, 0);
/*
 * Obsolete variables: $=, $KCODE and $-K are wired to the ignorecase_ and
 * kcode_ getter/setter pairs; $KCODE and $-K share the same pair.
 */
3360 rb_define_virtual_variable("$=", ignorecase_getter
, ignorecase_setter
);
3361 rb_define_virtual_variable("$KCODE", kcode_getter
, kcode_setter
);
3362 rb_define_virtual_variable("$-K", kcode_getter
, kcode_setter
);
/* Regexp class, its allocator and its singleton (class-level) methods. */
3364 rb_cRegexp
= rb_define_class("Regexp", rb_cObject
);
3365 rb_define_alloc_func(rb_cRegexp
, rb_reg_s_alloc
);
/* Regexp.compile is bound to rb_class_new_instance. */
3366 rb_define_singleton_method(rb_cRegexp
, "compile", rb_class_new_instance
, -1);
/* "quote" and "escape" are two names for the same handler. */
3367 rb_define_singleton_method(rb_cRegexp
, "quote", rb_reg_s_quote
, 1);
3368 rb_define_singleton_method(rb_cRegexp
, "escape", rb_reg_s_quote
, 1);
3369 rb_define_singleton_method(rb_cRegexp
, "union", rb_reg_s_union_m
, -2);
3370 rb_define_singleton_method(rb_cRegexp
, "last_match", rb_reg_s_last_match
, -1);
3371 rb_define_singleton_method(rb_cRegexp
, "try_convert", rb_reg_s_try_convert
, 1);
/* Regexp instance methods ("eql?" and "==" share rb_reg_equal). */
3373 rb_define_method(rb_cRegexp
, "initialize", rb_reg_initialize_m
, -1);
3374 rb_define_method(rb_cRegexp
, "initialize_copy", rb_reg_init_copy
, 1);
3375 rb_define_method(rb_cRegexp
, "hash", rb_reg_hash
, 0);
3376 rb_define_method(rb_cRegexp
, "eql?", rb_reg_equal
, 1);
3377 rb_define_method(rb_cRegexp
, "==", rb_reg_equal
, 1);
3378 rb_define_method(rb_cRegexp
, "=~", rb_reg_match
, 1);
3379 rb_define_method(rb_cRegexp
, "===", rb_reg_eqq
, 1);
3380 rb_define_method(rb_cRegexp
, "~", rb_reg_match2
, 0);
3381 rb_define_method(rb_cRegexp
, "match", rb_reg_match_m
, -1);
3382 rb_define_method(rb_cRegexp
, "to_s", rb_reg_to_s
, 0);
3383 rb_define_method(rb_cRegexp
, "inspect", rb_reg_inspect
, 0);
3384 rb_define_method(rb_cRegexp
, "source", rb_reg_source
, 0);
3385 rb_define_method(rb_cRegexp
, "casefold?", rb_reg_casefold_p
, 0);
3386 rb_define_method(rb_cRegexp
, "options", rb_reg_options_m
, 0);
3387 rb_define_method(rb_cRegexp
, "encoding", rb_obj_encoding
, 0); /* in encoding.c */
3388 rb_define_method(rb_cRegexp
, "fixed_encoding?", rb_reg_fixed_encoding_p
, 0);
3389 rb_define_method(rb_cRegexp
, "names", rb_reg_names
, 0);
3390 rb_define_method(rb_cRegexp
, "named_captures", rb_reg_named_captures
, 0);
/* Option constants exposed on Regexp, taken from the ONIG_OPTION_* values. */
3392 rb_define_const(rb_cRegexp
, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE
));
3393 rb_define_const(rb_cRegexp
, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND
));
3394 rb_define_const(rb_cRegexp
, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE
));
/*
 * Register reg_cache with rb_global_variable; the call takes the address of
 * the variable, hence the leading '&'.
 */
3396 rb_global_variable(&reg_cache
);
/*
 * MatchData class: allocator installed, and "new" undefined on the class's
 * own class (CLASS_OF(rb_cMatch)).
 */
3398 rb_cMatch
= rb_define_class("MatchData", rb_cObject
);
3399 rb_define_alloc_func(rb_cMatch
, match_alloc
);
3400 rb_undef_method(CLASS_OF(rb_cMatch
), "new");
/* MatchData instance methods ("size" and "length" share match_size). */
3402 rb_define_method(rb_cMatch
, "initialize_copy", match_init_copy
, 1);
3403 rb_define_method(rb_cMatch
, "regexp", match_regexp
, 0);
3404 rb_define_method(rb_cMatch
, "names", match_names
, 0);
3405 rb_define_method(rb_cMatch
, "size", match_size
, 0);
3406 rb_define_method(rb_cMatch
, "length", match_size
, 0);
3407 rb_define_method(rb_cMatch
, "offset", match_offset
, 1);
3408 rb_define_method(rb_cMatch
, "begin", match_begin
, 1);
3409 rb_define_method(rb_cMatch
, "end", match_end
, 1);
3410 rb_define_method(rb_cMatch
, "to_a", match_to_a
, 0);
3411 rb_define_method(rb_cMatch
, "[]", match_aref
, -1);
3412 rb_define_method(rb_cMatch
, "captures", match_captures
, 0);
3413 rb_define_method(rb_cMatch
, "values_at", match_values_at
, -1);
3414 rb_define_method(rb_cMatch
, "pre_match", rb_reg_match_pre
, 0);
3415 rb_define_method(rb_cMatch
, "post_match", rb_reg_match_post
, 0);
3416 rb_define_method(rb_cMatch
, "to_s", match_to_s
, 0);
3417 rb_define_method(rb_cMatch
, "inspect", match_inspect
, 0);
3418 rb_define_method(rb_cMatch
, "string", match_string
, 0);