* file.c (rb_find_file_ext): guard load_path from GC.
[ruby-svn.git] / re.c
blob1f473817aacbc6542d5cf57a7f18409ca9189c8d
/**********************************************************************

  re.c -

  $Author$
  created at: Mon Aug  9 18:24:49 JST 1993

  Copyright (C) 1993-2007 Yukihiro Matsumoto

**********************************************************************/
12 #include "ruby/ruby.h"
13 #include "ruby/re.h"
14 #include "ruby/encoding.h"
15 #include "ruby/util.h"
16 #include "regint.h"
17 #include <ctype.h>
19 VALUE rb_eRegexpError;
21 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
23 #define BEG(no) regs->beg[no]
24 #define END(no) regs->end[no]
#if 'a' == 97   /* it's ascii */
/*
 * Case-folding table indexed by byte value: 'A'..'Z' map to 'a'..'z',
 * every other byte maps to itself.
 */
static const char casetable[] = {
        '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
        '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
        '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
        '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
        /* ' '     '!'     '"'     '#'     '$'     '%'     '&'     ''' */
        '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
        /* '('     ')'     '*'     '+'     ','     '-'     '.'     '/' */
        '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
        /* '0'     '1'     '2'     '3'     '4'     '5'     '6'     '7' */
        '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
        /* '8'     '9'     ':'     ';'     '<'     '='     '>'     '?' */
        '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
        /* '@'     'A'     'B'     'C'     'D'     'E'     'F'     'G' */
        '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
        /* 'H'     'I'     'J'     'K'     'L'     'M'     'N'     'O' */
        '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
        /* 'P'     'Q'     'R'     'S'     'T'     'U'     'V'     'W' */
        '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
        /* 'X'     'Y'     'Z'     '['     '\'     ']'     '^'     '_' */
        '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
        /* '`'     'a'     'b'     'c'     'd'     'e'     'f'     'g' */
        '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
        /* 'h'     'i'     'j'     'k'     'l'     'm'     'n'     'o' */
        '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
        /* 'p'     'q'     'r'     's'     't'     'u'     'v'     'w' */
        '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
        /* 'x'     'y'     'z'     '{'     '|'     '}'     '~' */
        '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
        '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
        '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
        '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
        '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
        '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
        '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
        '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
        '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
        '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
        '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
        '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
        '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
        '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
        '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
        '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
        '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#else
# error >>> "You lose. You will need a translation table for your character set." <<<
#endif

/*
 * Compare the first `len` bytes of `x` and `y`, treating ASCII upper- and
 * lower-case letters as equal.
 *
 * Returns 0 when all `len` folded bytes match, otherwise the difference of
 * the first pair of folded bytes that differ.
 *
 * NOTE(review): the table elements are plain `char`, so for bytes >= 0x80
 * the sign of the result depends on whether `char` is signed on the target.
 */
int
rb_memcicmp(const void *x, const void *y, long len)
{
    const unsigned char *lhs = x;
    const unsigned char *rhs = y;

    for (; len; --len, ++lhs, ++rhs) {
        const int diff = casetable[(unsigned)*lhs] - casetable[(unsigned)*rhs];

        if (diff)
            return diff;
    }
    return 0;
}
#undef rb_memcmp

/*
 * Function form of rb_memcmp: compares `len` bytes of `p1` and `p2` and
 * returns whatever memcmp() returns for them.
 */
int
rb_memcmp(const void *p1, const void *p2, long len)
{
    const int order = memcmp(p1, p2, len);

    return order;
}
98 static inline long
99 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
101 const unsigned char *x = xs, *xe = xs + m;
102 const unsigned char *y = ys, *ye = ys + n;
103 #ifndef VALUE_MAX
104 # if SIZEOF_VALUE == 8
105 # define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
106 # elif SIZEOF_VALUE == 4
107 # define VALUE_MAX 0xFFFFFFFFUL
108 # endif
109 #endif
110 VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
112 if (m > SIZEOF_VALUE)
113 rb_bug("!!too long pattern string!!");
115 /* Prepare hash value */
116 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
117 hx <<= CHAR_BIT;
118 hy <<= CHAR_BIT;
119 hx |= *x;
120 hy |= *y;
122 /* Searching */
123 while (hx != hy) {
124 if (y == ye)
125 return -1;
126 hy <<= CHAR_BIT;
127 hy |= *y;
128 hy &= mask;
129 y++;
131 return y - ys - m;
/*
 * Quick Search (Sunday) substring search: find the `m`-byte pattern `xs`
 * in the `n`-byte buffer `ys`.
 *
 * After each failed comparison the window is advanced by a distance looked
 * up from the byte that immediately follows the window.  That byte is only
 * read while a further window still fits, so the function never touches
 * ys[n].
 *
 * Returns the byte offset of the first match, or -1 when there is none
 * (including when m > n).
 */
static inline long
rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const unsigned char *y = ys, *last;
    long i, qstable[256];

    if (m > n)
        return -1;
    last = ys + (n - m);        /* start of the last window that fits */

    /* Preprocessing: default shift skips the whole window plus one byte;
     * bytes occurring in the pattern get the distance from their last
     * occurrence to the end of the pattern. */
    for (i = 0; i < 256; ++i)
        qstable[i] = m + 1;
    for (; x < xe; ++x)
        qstable[*x] = xe - x;

    /* Searching */
    while (y <= last) {
        if (*xs == *y && memcmp(xs, y, m) == 0)
            return y - ys;
        if (y == last)          /* no byte after the final window to inspect */
            break;
        y += qstable[y[m]];
    }
    return -1;
}
/*
 * Map the byte sequence starting at `x` to an index in the range 0..511.
 *
 * Lead bytes below 0xC0 and from 0xF5 upwards map to themselves plus 256.
 * Lead bytes 0xC0..0xF4 are mixed with the following 1, 2 or 3 bytes (for
 * lead bytes below 0xE0, below 0xF0 and below 0xF5 respectively) and the
 * result is reduced to a single byte.
 *
 * Note that the following bytes are read without any bounds check.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    unsigned int h = *x;
    int trailing, k;

    if (h < 0xC0 || h >= 0xF5)
        return h + 256;

    if (h < 0xE0)
        trailing = 1;
    else if (h < 0xF0)
        trailing = 2;
    else
        trailing = 3;

    for (k = 1; k <= trailing; k++) {
        h *= mix;
        h += x[k];
    }
    return (unsigned char)h;
}
186 static inline long
187 rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
189 const unsigned char *x = xs, *xe = xs + m;
190 const unsigned char *y = ys;
191 VALUE i, qstable[512];
193 /* Preprocessing */
194 for (i = 0; i < 512; ++i) {
195 qstable[i] = m + 1;
197 for (; x < xe; ++x) {
198 qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
200 /* Searching */
201 for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) {
202 if (*xs == *y && memcmp(xs, y, m) == 0)
203 return y - ys;
205 return -1;
208 long
209 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
211 const unsigned char *x = x0, *y = y0;
213 if (m > n) return -1;
214 else if (m == n) {
215 return memcmp(x0, y0, m) == 0 ? 0 : -1;
217 else if (m < 1) {
218 return 0;
220 else if (m == 1) {
221 const unsigned char *ys = y, *ye = ys + n;
222 for (; y < ye; ++y) {
223 if (*x == *y)
224 return y - ys;
226 return -1;
228 else if (m <= SIZEOF_VALUE) {
229 return rb_memsearch_ss(x0, m, y0, n);
231 else if (enc == rb_utf8_encoding()){
232 return rb_memsearch_qs_utf8(x0, m, y0, n);
234 else {
235 return rb_memsearch_qs(x0, m, y0, n);
/* Flag bits kept in a Regexp object's header flags. */
#define REG_LITERAL FL_USER5
#define REG_ENCODING_NONE FL_USER6
#define REG_BUSY FL_USER7

#define KCODE_FIXED FL_USER4

/* Oniguruma option bits that correspond to the i / m / x option letters. */
#define ARG_REG_OPTION_MASK \
    (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
/* Values reported through rb_char_to_option_kcode()'s `option` output. */
#define ARG_ENCODING_FIXED 16
#define ARG_ENCODING_NONE 32
250 static int
251 char_to_option(int c)
253 int val;
255 switch (c) {
256 case 'i':
257 val = ONIG_OPTION_IGNORECASE;
258 break;
259 case 'x':
260 val = ONIG_OPTION_EXTEND;
261 break;
262 case 'm':
263 val = ONIG_OPTION_MULTILINE;
264 break;
265 default:
266 val = 0;
267 break;
269 return val;
272 static char *
273 option_to_str(char str[4], int options)
275 char *p = str;
276 if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
277 if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
278 if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
279 *p = 0;
280 return str;
283 extern int
284 rb_char_to_option_kcode(int c, int *option, int *kcode)
286 *option = 0;
288 switch (c) {
289 case 'n':
290 *kcode = -1;
291 return (*option = ARG_ENCODING_NONE);
292 case 'e':
293 *kcode = rb_enc_find_index("EUC-JP");
294 break;
295 case 's':
296 *kcode = rb_enc_find_index("Windows-31J");
297 break;
298 case 'u':
299 *kcode = rb_enc_find_index("UTF-8");
300 break;
301 default:
302 *kcode = -1;
303 return (*option = char_to_option(c));
305 *option = ARG_ENCODING_FIXED;
306 return 1;
309 static void
310 rb_reg_check(VALUE re)
312 if (!RREGEXP(re)->ptr || !RREGEXP(re)->str) {
313 rb_raise(rb_eTypeError, "uninitialized Regexp");
317 static void
318 rb_reg_expr_str(VALUE str, const char *s, long len)
320 rb_encoding *enc = rb_enc_get(str);
321 const char *p, *pend;
322 int need_escape = 0;
323 int c, clen;
325 p = s; pend = p + len;
326 while (p<pend) {
327 c = rb_enc_ascget(p, pend, &clen, enc);
328 if (c == -1) {
329 p += mbclen(p, pend, enc);
331 else if (c != '/' && rb_enc_isprint(c, enc)) {
332 p += clen;
334 else {
335 need_escape = 1;
336 break;
339 if (!need_escape) {
340 rb_str_buf_cat(str, s, len);
342 else {
343 p = s;
344 while (p<pend) {
345 c = rb_enc_ascget(p, pend, &clen, enc);
346 if (c == '\\' && p+clen < pend) {
347 int n = clen + mbclen(p+clen, pend, enc);
348 rb_str_buf_cat(str, p, n);
349 p += n;
350 continue;
352 else if (c == '/') {
353 char c = '\\';
354 rb_str_buf_cat(str, &c, 1);
355 rb_str_buf_cat(str, p, clen);
357 else if (c == -1) {
358 int l = mbclen(p, pend, enc);
359 rb_str_buf_cat(str, p, l);
360 p += l;
361 continue;
363 else if (rb_enc_isprint(c, enc)) {
364 rb_str_buf_cat(str, p, clen);
366 else if (!rb_enc_isspace(c, enc)) {
367 char b[8];
369 sprintf(b, "\\x%02X", c);
370 rb_str_buf_cat(str, b, 4);
372 else {
373 rb_str_buf_cat(str, p, clen);
375 p += clen;
380 static VALUE
381 rb_reg_desc(const char *s, long len, VALUE re)
383 VALUE str = rb_str_buf_new2("/");
385 rb_enc_copy(str, re);
386 rb_reg_expr_str(str, s, len);
387 rb_str_buf_cat2(str, "/");
388 if (re) {
389 char opts[4];
390 rb_reg_check(re);
391 if (*option_to_str(opts, RREGEXP(re)->ptr->options))
392 rb_str_buf_cat2(str, opts);
394 OBJ_INFECT(str, re);
395 return str;
400 * call-seq:
401 * rxp.source => str
403 * Returns the original string of the pattern.
405 * /ab+c/ix.source #=> "ab+c"
407 * Note that escape sequences are retained as is.
409 * /\x20\+/.source #=> "\\x20\\+"
413 static VALUE
414 rb_reg_source(VALUE re)
416 VALUE str;
418 rb_reg_check(re);
419 str = rb_enc_str_new(RREGEXP(re)->str,RREGEXP(re)->len, rb_enc_get(re));
420 if (OBJ_TAINTED(re)) OBJ_TAINT(str);
421 return str;
425 * call-seq:
426 * rxp.inspect => string
428 * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly,
429 * <code>#inspect</code> actually produces the more natural version of
430 * the string than <code>#to_s</code>.
432 * /ab+c/ix.inspect #=> "/ab+c/ix"
436 static VALUE
437 rb_reg_inspect(VALUE re)
439 rb_reg_check(re);
440 return rb_reg_desc(RREGEXP(re)->str, RREGEXP(re)->len, re);
445 * call-seq:
446 * rxp.to_s => str
448 * Returns a string containing the regular expression and its options (using the
449 * <code>(?opts:source)</code> notation. This string can be fed back in to
450 * <code>Regexp::new</code> to a regular expression with the same semantics as
451 * the original. (However, <code>Regexp#==</code> may not return true when
452 * comparing the two, as the source of the regular expression itself may
453 * differ, as the example shows). <code>Regexp#inspect</code> produces a
454 * generally more readable version of <i>rxp</i>.
456 * r1 = /ab+c/ix #=> /ab+c/ix
457 * s1 = r1.to_s #=> "(?ix-m:ab+c)"
458 * r2 = Regexp.new(s1) #=> /(?ix-m:ab+c)/
459 * r1 == r2 #=> false
460 * r1.source #=> "ab+c"
461 * r2.source #=> "(?ix-m:ab+c)"
464 static VALUE
465 rb_reg_to_s(VALUE re)
467 int options, opt;
468 const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
469 long len;
470 const UChar* ptr;
471 VALUE str = rb_str_buf_new2("(?");
472 char optbuf[5];
474 rb_reg_check(re);
476 rb_enc_copy(str, re);
477 options = RREGEXP(re)->ptr->options;
478 ptr = (UChar*)RREGEXP(re)->str;
479 len = RREGEXP(re)->len;
480 again:
481 if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
482 int err = 1;
483 ptr += 2;
484 if ((len -= 2) > 0) {
485 do {
486 opt = char_to_option((int )*ptr);
487 if (opt != 0) {
488 options |= opt;
490 else {
491 break;
493 ++ptr;
494 } while (--len > 0);
496 if (len > 1 && *ptr == '-') {
497 ++ptr;
498 --len;
499 do {
500 opt = char_to_option((int )*ptr);
501 if (opt != 0) {
502 options &= ~opt;
504 else {
505 break;
507 ++ptr;
508 } while (--len > 0);
510 if (*ptr == ')') {
511 --len;
512 ++ptr;
513 goto again;
515 if (*ptr == ':' && ptr[len-1] == ')') {
516 int r;
517 Regexp *rp;
518 r = onig_alloc_init(&rp, ONIG_OPTION_DEFAULT,
519 ONIGENC_CASE_FOLD_DEFAULT,
520 rb_enc_get(re),
521 OnigDefaultSyntax);
522 if (r == 0) {
523 ++ptr;
524 len -= 2;
525 err = (onig_compile(rp, ptr, ptr + len, NULL) != 0);
527 onig_free(rp);
529 if (err) {
530 options = RREGEXP(re)->ptr->options;
531 ptr = (UChar*)RREGEXP(re)->str;
532 len = RREGEXP(re)->len;
536 if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
538 if ((options & embeddable) != embeddable) {
539 optbuf[0] = '-';
540 option_to_str(optbuf + 1, ~options);
541 rb_str_buf_cat2(str, optbuf);
544 rb_str_buf_cat2(str, ":");
545 rb_reg_expr_str(str, (char*)ptr, len);
546 rb_str_buf_cat2(str, ")");
547 rb_enc_copy(str, re);
549 OBJ_INFECT(str, re);
550 return str;
553 static void
554 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
556 VALUE desc = rb_reg_desc(s, len, re);
558 rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
561 static VALUE
562 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
564 char opts[6];
565 VALUE desc = rb_str_buf_new2(err);
567 rb_enc_associate(desc, enc);
568 rb_str_buf_cat2(desc, ": /");
569 rb_reg_expr_str(desc, s, len);
570 opts[0] = '/';
571 option_to_str(opts + 1, options);
572 rb_str_buf_cat2(desc, opts);
573 return rb_exc_new3(rb_eRegexpError, desc);
576 static void
577 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
579 rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
582 static VALUE
583 rb_reg_error_desc(VALUE str, int options, const char *err)
585 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
586 rb_enc_get(str), options, err);
589 static void
590 rb_reg_raise_str(VALUE str, int options, const char *err)
592 rb_exc_raise(rb_reg_error_desc(str, options, err));
597 * call-seq:
598 * rxp.casefold? => true or false
600 * Returns the value of the case-insensitive flag.
602 * /a/.casefold? #=> false
603 * /a/i.casefold? #=> true
604 * /(?i:a)/.casefold? #=> false
607 static VALUE
608 rb_reg_casefold_p(VALUE re)
610 rb_reg_check(re);
611 if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
612 return Qfalse;
617 * call-seq:
618 * rxp.options => fixnum
620 * Returns the set of bits corresponding to the options used when creating this
621 * Regexp (see <code>Regexp::new</code> for details. Note that additional bits
622 * may be set in the returned options: these are used internally by the regular
623 * expression code. These extra bits are ignored if the options are passed to
624 * <code>Regexp::new</code>.
626 * Regexp::IGNORECASE #=> 1
627 * Regexp::EXTENDED #=> 2
628 * Regexp::MULTILINE #=> 4
630 * /cat/.options #=> 0
631 * /cat/ix.options #=> 3
632 * Regexp.new('cat', true).options #=> 1
633 * /\xa1\xa2/e.options #=> 16
635 * r = /cat/ix
636 * Regexp.new(r.source, r.options) #=> /cat/ix
639 static VALUE
640 rb_reg_options_m(VALUE re)
642 int options = rb_reg_options(re);
643 return INT2NUM(options);
646 static int
647 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
648 int back_num, int *back_refs, OnigRegex regex, void *arg)
650 VALUE ary = (VALUE)arg;
651 rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
652 return 0;
656 * call-seq:
657 * rxp.names => [name1, name2, ...]
659 * Returns a list of names of captures as an array of strings.
661 * /(?<foo>.)(?<bar>.)(?<baz>.)/.names
662 * #=> ["foo", "bar", "baz"]
664 * /(?<foo>.)(?<foo>.)/.names
665 * #=> ["foo"]
667 * /(.)(.)/.names
668 * #=> []
671 static VALUE
672 rb_reg_names(VALUE re)
674 VALUE ary = rb_ary_new();
675 onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
676 return ary;
679 static int
680 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
681 int back_num, int *back_refs, OnigRegex regex, void *arg)
683 VALUE hash = (VALUE)arg;
684 VALUE ary = rb_ary_new2(back_num);
685 int i;
687 for(i = 0; i < back_num; i++)
688 rb_ary_store(ary, i, INT2NUM(back_refs[i]));
690 rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
692 return 0;
696 * call-seq:
697 * rxp.named_captures => hash
699 * Returns a hash representing information about named captures of <i>rxp</i>.
701 * A key of the hash is a name of the named captures.
702 * A value of the hash is an array which is list of indexes of corresponding
703 * named captures.
705 * /(?<foo>.)(?<bar>.)/.named_captures
706 * #=> {"foo"=>[1], "bar"=>[2]}
708 * /(?<foo>.)(?<foo>.)/.named_captures
709 * #=> {"foo"=>[1, 2]}
711 * If there are no named captures, an empty hash is returned.
713 * /(.)(.)/.named_captures
714 * #=> {}
717 static VALUE
718 rb_reg_named_captures(VALUE re)
720 VALUE hash = rb_hash_new();
721 onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
722 return hash;
725 static Regexp*
726 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err)
728 Regexp *rp;
729 int r;
730 OnigErrorInfo einfo;
732 /* Handle escaped characters first. */
734 /* Build a copy of the string (in dest) with the
735 escaped characters translated, and generate the regex
736 from that.
739 r = onig_alloc_init(&rp, flags, ONIGENC_CASE_FOLD_DEFAULT,
740 enc, OnigDefaultSyntax);
741 if (r) {
742 onig_error_code_to_str((UChar*)err, r);
743 return 0;
746 r = onig_compile(rp, (UChar*)s, (UChar*)(s + len), &einfo);
748 if (r != 0) {
749 onig_free(rp);
750 (void )onig_error_code_to_str((UChar*)err, r, &einfo);
751 return 0;
753 return rp;
758 * Document-class: MatchData
760 * <code>MatchData</code> is the type of the special variable <code>$~</code>,
761 * and is the type of the object returned by <code>Regexp#match</code> and
762 * <code>Regexp.last_match</code>. It encapsulates all the results of a pattern
763 * match, results normally accessed through the special variables
764 * <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>,
765 * <code>$2</code>, and so on.
769 VALUE rb_cMatch;
771 static VALUE
772 match_alloc(VALUE klass)
774 NEWOBJ(match, struct RMatch);
775 OBJSETUP(match, klass, T_MATCH);
777 match->str = 0;
778 match->rmatch = 0;
779 match->regexp = 0;
780 match->rmatch = ALLOC(struct rmatch);
781 MEMZERO(match->rmatch, struct rmatch, 1);
783 return (VALUE)match;
/* A byte offset paired with the character offset it corresponds to. */
typedef struct {
    int byte_pos;
    int char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparator for pair_t: orders by ascending byte_pos
 * (returns the difference of the two byte positions).
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const pair_t *lhs = pair1;
    const pair_t *rhs = pair2;

    return lhs->byte_pos - rhs->byte_pos;
}
797 static void
798 update_char_offset(VALUE match)
800 struct rmatch *rm = RMATCH(match)->rmatch;
801 struct re_registers *regs;
802 int num_regs;
803 int i, num_pos, c;
804 char *s, *p, *q, *e;
805 rb_encoding *enc;
806 pair_t *pairs;
808 if (rm->char_offset_updated)
809 return;
811 regs = &rm->regs;
812 num_regs = rm->regs.num_regs;
814 if (rm->char_offset_num_allocated < num_regs) {
815 REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
816 rm->char_offset_num_allocated = num_regs;
819 enc = rb_enc_get(RMATCH(match)->str);
820 if (rb_enc_mbmaxlen(enc) == 1) {
821 for (i = 0; i < num_regs; i++) {
822 rm->char_offset[i].beg = BEG(i);
823 rm->char_offset[i].end = END(i);
825 rm->char_offset_updated = 1;
826 return;
829 pairs = ALLOCA_N(pair_t, num_regs*2);
830 num_pos = 0;
831 for (i = 0; i < num_regs; i++) {
832 if (BEG(i) < 0)
833 continue;
834 pairs[num_pos++].byte_pos = BEG(i);
835 pairs[num_pos++].byte_pos = END(i);
837 qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
839 s = p = RSTRING_PTR(RMATCH(match)->str);
840 e = s + RSTRING_LEN(RMATCH(match)->str);
841 c = 0;
842 for (i = 0; i < num_pos; i++) {
843 q = s + pairs[i].byte_pos;
844 c += rb_enc_strlen(p, q, enc);
845 pairs[i].char_pos = c;
846 p = q;
849 for (i = 0; i < num_regs; i++) {
850 pair_t key, *found;
851 if (BEG(i) < 0) {
852 rm->char_offset[i].beg = -1;
853 rm->char_offset[i].end = -1;
854 continue;
857 key.byte_pos = BEG(i);
858 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
859 rm->char_offset[i].beg = found->char_pos;
861 key.byte_pos = END(i);
862 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
863 rm->char_offset[i].end = found->char_pos;
866 rm->char_offset_updated = 1;
869 /* :nodoc: */
870 static VALUE
871 match_init_copy(VALUE obj, VALUE orig)
873 struct rmatch *rm;
875 if (obj == orig) return obj;
877 if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
878 rb_raise(rb_eTypeError, "wrong argument class");
880 RMATCH(obj)->str = RMATCH(orig)->str;
881 RMATCH(obj)->regexp = RMATCH(orig)->regexp;
883 rm = RMATCH(obj)->rmatch;
884 onig_region_free(&rm->regs, 0);
885 rm->regs.allocated = 0;
887 onig_region_copy(&rm->regs, RMATCH_REGS(orig));
889 if (!RMATCH(orig)->rmatch->char_offset_updated) {
890 rm->char_offset_updated = 0;
892 else {
893 if (rm->char_offset_num_allocated < rm->regs.num_regs) {
894 REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
895 rm->char_offset_num_allocated = rm->regs.num_regs;
897 MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
898 struct rmatch_offset, rm->regs.num_regs);
899 rm->char_offset_updated = 1;
902 return obj;
907 * call-seq:
908 * mtch.regexp => regexp
910 * Returns the regexp.
912 * m = /a.*b/.match("abc")
913 * m.regexp #=> /a.*b/
916 static VALUE
917 match_regexp(VALUE match)
919 return RMATCH(match)->regexp;
923 * call-seq:
924 * mtch.names => [name1, name2, ...]
926 * Returns a list of names of captures as an array of strings.
927 * It is same as mtch.regexp.names.
929 * /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
930 * #=> ["foo", "bar", "baz"]
932 * m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
933 * m.names #=> ["x", "y"]
936 static VALUE
937 match_names(VALUE match)
939 return rb_reg_names(RMATCH(match)->regexp);
943 * call-seq:
944 * mtch.length => integer
945 * mtch.size => integer
947 * Returns the number of elements in the match array.
949 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
950 * m.length #=> 5
951 * m.size #=> 5
954 static VALUE
955 match_size(VALUE match)
957 return INT2FIX(RMATCH_REGS(match)->num_regs);
960 static int
961 match_backref_number(VALUE match, VALUE backref)
963 const char *name;
964 int num;
966 struct re_registers *regs = RMATCH_REGS(match);
967 VALUE regexp = RMATCH(match)->regexp;
969 switch(TYPE(backref)) {
970 default:
971 return NUM2INT(backref);
973 case T_SYMBOL:
974 name = rb_id2name(SYM2ID(backref));
975 break;
977 case T_STRING:
978 name = StringValueCStr(backref);
979 break;
982 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
983 (const unsigned char*)name,
984 (const unsigned char*)name + strlen(name),
985 regs);
987 if (num < 1) {
988 rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
991 return num;
996 * call-seq:
997 * mtch.offset(n) => array
999 * Returns a two-element array containing the beginning and ending offsets of
1000 * the <em>n</em>th match.
1001 * <em>n</em> can be a string or symbol to reference a named capture.
1003 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1004 * m.offset(0) #=> [1, 7]
1005 * m.offset(4) #=> [6, 7]
1007 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1008 * p m.offset(:foo) #=> [0, 1]
1009 * p m.offset(:bar) #=> [2, 3]
1013 static VALUE
1014 match_offset(VALUE match, VALUE n)
1016 int i = match_backref_number(match, n);
1017 struct re_registers *regs = RMATCH_REGS(match);
1019 if (i < 0 || regs->num_regs <= i)
1020 rb_raise(rb_eIndexError, "index %d out of matches", i);
1022 if (BEG(i) < 0)
1023 return rb_assoc_new(Qnil, Qnil);
1025 update_char_offset(match);
1026 return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
1027 INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
1032 * call-seq:
1033 * mtch.begin(n) => integer
1035 * Returns the offset of the start of the <em>n</em>th element of the match
1036 * array in the string.
1037 * <em>n</em> can be a string or symbol to reference a named capture.
1039 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1040 * m.begin(0) #=> 1
1041 * m.begin(2) #=> 2
1043 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1044 * p m.begin(:foo) #=> 0
1045 * p m.begin(:bar) #=> 2
1048 static VALUE
1049 match_begin(VALUE match, VALUE n)
1051 int i = match_backref_number(match, n);
1052 struct re_registers *regs = RMATCH_REGS(match);
1054 if (i < 0 || regs->num_regs <= i)
1055 rb_raise(rb_eIndexError, "index %d out of matches", i);
1057 if (BEG(i) < 0)
1058 return Qnil;
1060 update_char_offset(match);
1061 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
1066 * call-seq:
1067 * mtch.end(n) => integer
1069 * Returns the offset of the character immediately following the end of the
1070 * <em>n</em>th element of the match array in the string.
1071 * <em>n</em> can be a string or symbol to reference a named capture.
1073 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1074 * m.end(0) #=> 7
1075 * m.end(2) #=> 3
1077 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1078 * p m.end(:foo) #=> 1
1079 * p m.end(:bar) #=> 3
1082 static VALUE
1083 match_end(VALUE match, VALUE n)
1085 int i = match_backref_number(match, n);
1086 struct re_registers *regs = RMATCH_REGS(match);
1088 if (i < 0 || regs->num_regs <= i)
1089 rb_raise(rb_eIndexError, "index %d out of matches", i);
1091 if (BEG(i) < 0)
1092 return Qnil;
1094 update_char_offset(match);
1095 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
1098 #define MATCH_BUSY FL_USER2
1100 void
1101 rb_match_busy(VALUE match)
1103 FL_SET(match, MATCH_BUSY);
1107 * call-seq:
1108 * rxp.fixed_encoding? => true or false
1110 * Returns false if rxp is applicable to
1111 * a string with any ASCII compatible encoding.
1112 * Returns true otherwise.
1114 * r = /a/
1115 * r.fixed_encoding? #=> false
1116 * r =~ "\u{6666} a" #=> 2
1117 * r =~ "\xa1\xa2 a".force_encoding("euc-jp") #=> 2
1118 * r =~ "abc".force_encoding("euc-jp") #=> 0
1120 * r = /a/u
1121 * r.fixed_encoding? #=> true
1122 * r.encoding #=> #<Encoding:UTF-8>
1123 * r =~ "\u{6666} a" #=> 2
1124 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError
1125 * r =~ "abc".force_encoding("euc-jp") #=> 0
1127 * r = /\u{6666}/
1128 * r.fixed_encoding? #=> true
1129 * r.encoding #=> #<Encoding:UTF-8>
1130 * r =~ "\u{6666} a" #=> 0
1131 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError
1132 * r =~ "abc".force_encoding("euc-jp") #=> nil
1135 static VALUE
1136 rb_reg_fixed_encoding_p(VALUE re)
1138 if (FL_TEST(re, KCODE_FIXED))
1139 return Qtrue;
1140 else
1141 return Qfalse;
1144 static VALUE
1145 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
1146 rb_encoding **fixed_enc, onig_errmsg_buffer err);
1149 static rb_encoding*
1150 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
1152 rb_encoding *enc = 0;
1154 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
1155 rb_raise(rb_eArgError,
1156 "broken %s string",
1157 rb_enc_name(rb_enc_get(str)));
1160 rb_reg_check(re);
1161 /* ignorecase status */
1162 if (rb_reg_fixed_encoding_p(re) || !rb_enc_str_asciicompat_p(str)) {
1163 if (ENCODING_GET(re) != rb_enc_get_index(str) &&
1164 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1165 rb_raise(rb_eArgError,
1166 "incompatible encoding regexp match (%s regexp with %s string)",
1167 rb_enc_name(rb_enc_from_index(ENCODING_GET(re))),
1168 rb_enc_name(rb_enc_get(str)));
1171 else {
1172 enc = rb_enc_get(str);
1173 if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
1174 enc != rb_ascii8bit_encoding() &&
1175 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1176 rb_warn("regexp match /.../n against to %s string",
1177 rb_enc_name(enc));
1179 return enc;
1181 return RREGEXP(re)->ptr->enc;
1184 static regex_t *
1185 rb_reg_prepare_re(VALUE re, rb_encoding *enc)
1187 regex_t *reg = RREGEXP(re)->ptr;
1188 onig_errmsg_buffer err = "";
1189 int r;
1190 OnigErrorInfo einfo;
1191 const char *pattern;
1192 VALUE unescaped;
1193 rb_encoding *fixed_enc = 0;
1195 if (reg->enc == enc) return reg;
1197 rb_reg_check(re);
1198 reg = RREGEXP(re)->ptr;
1199 pattern = RREGEXP(re)->str;
1201 unescaped = rb_reg_preprocess(
1202 pattern, pattern + RREGEXP(re)->len, enc,
1203 &fixed_enc, err);
1205 if (unescaped == Qnil) {
1206 rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
1209 r = onig_new(&reg, (UChar* )RSTRING_PTR(unescaped),
1210 (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
1211 reg->options, enc,
1212 OnigDefaultSyntax, &einfo);
1213 if (r) {
1214 onig_error_code_to_str((UChar*)err, r, &einfo);
1215 rb_reg_raise(pattern, RREGEXP(re)->len, err, re);
1218 RB_GC_GUARD(unescaped);
1219 return reg;
1223 rb_reg_adjust_startpos(VALUE re, VALUE str, int pos, int reverse)
1225 int range;
1226 rb_encoding *enc;
1227 UChar *p, *string;
1229 enc = rb_reg_prepare_enc(re, str, 0);
1231 if (reverse) {
1232 range = -pos;
1234 else {
1235 range = RSTRING_LEN(str) - pos;
1238 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
1239 string = (UChar*)RSTRING_PTR(str);
1241 if (range > 0) {
1242 p = onigenc_get_right_adjust_char_head(enc, string, string + pos);
1244 else {
1245 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos);
1247 return p - string;
1250 return pos;
1254 rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
1256 int result;
1257 VALUE match;
1258 struct re_registers regs;
1259 char *range = RSTRING_PTR(str);
1260 regex_t *reg0 = RREGEXP(re)->ptr, *reg;
1261 int busy = FL_TEST(re, REG_BUSY);
1263 if (pos > RSTRING_LEN(str) || pos < 0) {
1264 rb_backref_set(Qnil);
1265 return -1;
1268 reg = rb_reg_prepare_re(re, rb_reg_prepare_enc(re, str, 1));
1270 FL_SET(re, REG_BUSY);
1271 if (!reverse) {
1272 range += RSTRING_LEN(str);
1274 MEMZERO(&regs, struct re_registers, 1);
1275 result = onig_search(reg,
1276 (UChar*)(RSTRING_PTR(str)),
1277 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
1278 ((UChar*)(RSTRING_PTR(str)) + pos),
1279 ((UChar*)range),
1280 &regs, ONIG_OPTION_NONE);
1282 if (RREGEXP(re)->ptr != reg) {
1283 if (busy) {
1284 onig_free(reg);
1286 else {
1287 onig_free(reg0);
1288 RREGEXP(re)->ptr = reg;
1291 if (!busy) FL_UNSET(re, REG_BUSY);
1292 if (result < 0) {
1293 if (result == ONIG_MISMATCH) {
1294 rb_backref_set(Qnil);
1295 return result;
1297 else {
1298 onig_errmsg_buffer err = "";
1299 onig_error_code_to_str((UChar*)err, result);
1300 rb_reg_raise(RREGEXP(re)->str, RREGEXP(re)->len, err, 0);
1304 match = rb_backref_get();
1305 if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) {
1306 match = match_alloc(rb_cMatch);
1308 else {
1309 if (rb_safe_level() >= 3)
1310 OBJ_TAINT(match);
1311 else
1312 FL_UNSET(match, FL_TAINT);
1315 onig_region_copy(RMATCH_REGS(match), &regs);
1316 RMATCH(match)->str = rb_str_new4(str);
1317 RMATCH(match)->regexp = re;
1318 RMATCH(match)->rmatch->char_offset_updated = 0;
1319 rb_backref_set(match);
1321 OBJ_INFECT(match, re);
1322 OBJ_INFECT(match, str);
1324 return result;
1327 VALUE
1328 rb_reg_nth_defined(int nth, VALUE match)
1330 struct re_registers *regs;
1331 if (NIL_P(match)) return Qnil;
1332 regs = RMATCH_REGS(match);
1333 if (nth >= regs->num_regs) {
1334 return Qnil;
1336 if (nth < 0) {
1337 nth += regs->num_regs;
1338 if (nth <= 0) return Qnil;
1340 if (BEG(nth) == -1) return Qfalse;
1341 return Qtrue;
1344 VALUE
1345 rb_reg_nth_match(int nth, VALUE match)
1347 VALUE str;
1348 long start, end, len;
1349 struct re_registers *regs;
1351 if (NIL_P(match)) return Qnil;
1352 regs = RMATCH_REGS(match);
1353 if (nth >= regs->num_regs) {
1354 return Qnil;
1356 if (nth < 0) {
1357 nth += regs->num_regs;
1358 if (nth <= 0) return Qnil;
1360 start = BEG(nth);
1361 if (start == -1) return Qnil;
1362 end = END(nth);
1363 len = end - start;
1364 str = rb_str_subseq(RMATCH(match)->str, start, len);
1365 OBJ_INFECT(str, match);
1366 return str;
1369 VALUE
1370 rb_reg_last_match(VALUE match)
1372 return rb_reg_nth_match(0, match);
1377 * call-seq:
1378 * mtch.pre_match => str
1380 * Returns the portion of the original string before the current match.
1381 * Equivalent to the special variable <code>$`</code>.
1383 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1384 * m.pre_match #=> "T"
1387 VALUE
1388 rb_reg_match_pre(VALUE match)
1390 VALUE str;
1391 struct re_registers *regs;
1393 if (NIL_P(match)) return Qnil;
1394 regs = RMATCH_REGS(match);
1395 if (BEG(0) == -1) return Qnil;
1396 str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
1397 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1398 return str;
1403 * call-seq:
1404 * mtch.post_match => str
1406 * Returns the portion of the original string after the current match.
1407 * Equivalent to the special variable <code>$'</code>.
1409 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1410 * m.post_match #=> ": The Movie"
1413 VALUE
1414 rb_reg_match_post(VALUE match)
1416 VALUE str;
1417 long pos;
1418 struct re_registers *regs;
1420 if (NIL_P(match)) return Qnil;
1421 regs = RMATCH_REGS(match);
1422 if (BEG(0) == -1) return Qnil;
1423 str = RMATCH(match)->str;
1424 pos = END(0);
1425 str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
1426 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1427 return str;
1430 VALUE
1431 rb_reg_match_last(VALUE match)
1433 int i;
1434 struct re_registers *regs;
1436 if (NIL_P(match)) return Qnil;
1437 regs = RMATCH_REGS(match);
1438 if (BEG(0) == -1) return Qnil;
1440 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
1442 if (i == 0) return Qnil;
1443 return rb_reg_nth_match(i, match);
1446 static VALUE
1447 last_match_getter(void)
1449 return rb_reg_last_match(rb_backref_get());
1452 static VALUE
1453 prematch_getter(void)
1455 return rb_reg_match_pre(rb_backref_get());
1458 static VALUE
1459 postmatch_getter(void)
1461 return rb_reg_match_post(rb_backref_get());
1464 static VALUE
1465 last_paren_match_getter(void)
1467 return rb_reg_match_last(rb_backref_get());
1470 static VALUE
1471 match_array(VALUE match, int start)
1473 struct re_registers *regs = RMATCH_REGS(match);
1474 VALUE ary = rb_ary_new2(regs->num_regs);
1475 VALUE target = RMATCH(match)->str;
1476 int i;
1477 int taint = OBJ_TAINTED(match);
1479 for (i=start; i<regs->num_regs; i++) {
1480 if (regs->beg[i] == -1) {
1481 rb_ary_push(ary, Qnil);
1483 else {
1484 VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
1485 if (taint) OBJ_TAINT(str);
1486 rb_ary_push(ary, str);
1489 return ary;
1493 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the
1494 second example to prevent the '*' followed by a '/' from ending the
1495 comment. */
1498 * call-seq:
1499 * mtch.to_a => anArray
1501 * Returns the array of matches.
1503 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1504 * m.to_a #=> ["HX1138", "H", "X", "113", "8"]
1506 * Because <code>to_a</code> is called when expanding
1507 * <code>*</code><em>variable</em>, there's a useful assignment
1508 * shortcut for extracting matched fields. This is slightly slower than
1509 * accessing the fields directly (as an intermediate array is
1510 * generated).
1512 * all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138."))
1513 * all #=> "HX1138"
1514 * f1 #=> "H"
1515 * f2 #=> "X"
1516 * f3 #=> "113"
1519 static VALUE
1520 match_to_a(VALUE match)
1522 return match_array(match, 0);
1527 * call-seq:
1528 * mtch.captures => array
1530 * Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
1532 * f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
1533 * f1 #=> "H"
1534 * f2 #=> "X"
1535 * f3 #=> "113"
1536 * f4 #=> "8"
1538 static VALUE
1539 match_captures(VALUE match)
1541 return match_array(match, 1);
1544 static int
1545 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
1547 int num;
1549 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
1550 (const unsigned char* )name, (const unsigned char* )name_end, regs);
1551 if (num >= 1) {
1552 return num;
1554 else {
1555 VALUE s = rb_str_new(name, (long )(name_end - name));
1556 rb_raise(rb_eIndexError, "undefined group name reference: %s",
1557 StringValuePtr(s));
1562 * call-seq:
1563 * mtch[i] => str or nil
1564 * mtch[start, length] => array
1565 * mtch[range] => array
1566 * mtch[name] => str or nil
1568 * Match Reference---<code>MatchData</code> acts as an array, and may be
1569 * accessed using the normal array indexing techniques. <i>mtch</i>[0] is
1570 * equivalent to the special variable <code>$&</code>, and returns the entire
1571 * matched string. <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
1572 * of the matched backreferences (portions of the pattern between parentheses).
1574 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1575 * m #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
1576 * m[0] #=> "HX1138"
1577 * m[1, 2] #=> ["H", "X"]
1578 * m[1..3] #=> ["H", "X", "113"]
1579 * m[-3, 2] #=> ["X", "113"]
1581 * m = /(?<foo>a+)b/.match("ccaaab")
1582 * m #=> #<MatchData "aaab" foo:"aaa">
1583 * m["foo"] #=> "aaa"
1584 * m[:foo] #=> "aaa"
1587 static VALUE
1588 match_aref(int argc, VALUE *argv, VALUE match)
1590 VALUE idx, rest;
1592 rb_scan_args(argc, argv, "11", &idx, &rest);
1594 if (NIL_P(rest)) {
1595 if (FIXNUM_P(idx)) {
1596 if (FIX2INT(idx) >= 0) {
1597 return rb_reg_nth_match(FIX2INT(idx), match);
1600 else {
1601 const char *p;
1602 int num;
1604 switch (TYPE(idx)) {
1605 case T_SYMBOL:
1606 p = rb_id2name(SYM2ID(idx));
1607 goto name_to_backref;
1608 break;
1609 case T_STRING:
1610 p = StringValuePtr(idx);
1612 name_to_backref:
1613 num = name_to_backref_number(RMATCH_REGS(match),
1614 RMATCH(match)->regexp, p, p + strlen(p));
1615 return rb_reg_nth_match(num, match);
1616 break;
1618 default:
1619 break;
1624 return rb_ary_aref(argc, argv, match_to_a(match));
1627 static VALUE
1628 match_entry(VALUE match, long n)
1630 return rb_reg_nth_match(n, match);
1635 * call-seq:
1637 * mtch.values_at([index]*) => array
1639 * Uses each <i>index</i> to access the matching values, returning an array of
1640 * the corresponding matches.
1642 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1643 * m.to_a #=> ["HX1138", "H", "X", "113", "8"]
1644 * m.values_at(0, 2, -2) #=> ["HX1138", "X", "113"]
1647 static VALUE
1648 match_values_at(int argc, VALUE *argv, VALUE match)
1650 struct re_registers *regs = RMATCH_REGS(match);
1651 return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
1656 * call-seq:
1657 * mtch.to_s => str
1659 * Returns the entire matched string.
1661 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1662 * m.to_s #=> "HX1138"
1665 static VALUE
1666 match_to_s(VALUE match)
1668 VALUE str = rb_reg_last_match(match);
1670 if (NIL_P(str)) str = rb_str_new(0,0);
1671 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1672 if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
1673 return str;
1678 * call-seq:
1679 * mtch.string => str
1681 * Returns a frozen copy of the string passed in to <code>match</code>.
1683 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1684 * m.string #=> "THX1138."
1687 static VALUE
1688 match_string(VALUE match)
1690 return RMATCH(match)->str; /* str is frozen */
1693 struct backref_name_tag {
1694 const UChar *name;
1695 long len;
1698 static int
1699 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
1700 int back_num, int *back_refs, OnigRegex regex, void *arg0)
1702 struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
1703 int i;
1705 for (i = 0; i < back_num; i++) {
1706 arg[back_refs[i]].name = name;
1707 arg[back_refs[i]].len = name_end - name;
1709 return 0;
1713 * call-seq:
1714 * mtch.inspect => str
1716 * Returns a printable version of <i>mtch</i>.
1718 * puts /.$/.match("foo").inspect
1719 * #=> #<MatchData "o">
1721 * puts /(.)(.)(.)/.match("foo").inspect
1722 * #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
1724 * puts /(.)(.)?(.)/.match("fo").inspect
1725 * #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
1727 * puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
1728 * #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
1732 static VALUE
1733 match_inspect(VALUE match)
1735 char *cname = rb_obj_classname(match);
1736 VALUE str;
1737 int i;
1738 struct re_registers *regs = RMATCH_REGS(match);
1739 int num_regs = regs->num_regs;
1740 struct backref_name_tag *names;
1741 VALUE regexp = RMATCH(match)->regexp;
1743 if (regexp == 0) {
1744 return rb_sprintf("#<%s:%p>", cname, (void*)match);
1747 names = ALLOCA_N(struct backref_name_tag, num_regs);
1748 MEMZERO(names, struct backref_name_tag, num_regs);
1750 onig_foreach_name(RREGEXP(regexp)->ptr,
1751 match_inspect_name_iter, names);
1753 str = rb_str_buf_new2("#<");
1754 rb_str_buf_cat2(str, cname);
1756 for (i = 0; i < num_regs; i++) {
1757 VALUE v;
1758 rb_str_buf_cat2(str, " ");
1759 if (0 < i) {
1760 if (names[i].name)
1761 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
1762 else {
1763 char buf[sizeof(i)*3+1];
1764 snprintf(buf, sizeof(buf), "%d", i);
1765 rb_str_buf_cat2(str, buf);
1767 rb_str_buf_cat2(str, ":");
1769 v = rb_reg_nth_match(i, match);
1770 if (v == Qnil)
1771 rb_str_buf_cat2(str, "nil");
1772 else
1773 rb_str_buf_append(str, rb_str_inspect(v));
1775 rb_str_buf_cat2(str, ">");
1777 return str;
1780 VALUE rb_cRegexp;
1782 static int
1783 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
1785 const char *p = *pp;
1786 int code;
1787 int meta_prefix = 0, ctrl_prefix = 0;
1788 int len;
1789 int retbyte;
1791 retbyte = -1;
1792 if (p == end || *p++ != '\\') {
1793 strcpy(err, "too short escaped multibyte character");
1794 return -1;
1797 again:
1798 if (p == end) {
1799 strcpy(err, "too short escape sequence");
1800 return -1;
1802 switch (*p++) {
1803 case '\\': code = '\\'; break;
1804 case 'n': code = '\n'; break;
1805 case 't': code = '\t'; break;
1806 case 'r': code = '\r'; break;
1807 case 'f': code = '\f'; break;
1808 case 'v': code = '\013'; break;
1809 case 'a': code = '\007'; break;
1810 case 'e': code = '\033'; break;
1812 /* \OOO */
1813 case '0': case '1': case '2': case '3':
1814 case '4': case '5': case '6': case '7':
1815 p--;
1816 code = ruby_scan_oct(p, end < p+3 ? end-p : 3, &len);
1817 p += len;
1818 break;
1820 case 'x': /* \xHH */
1821 code = ruby_scan_hex(p, end < p+2 ? end-p : 2, &len);
1822 if (len < 1) {
1823 strcpy(err, "invalid hex escape");
1824 return -1;
1826 p += len;
1827 break;
1829 case 'M': /* \M-X, \M-\C-X, \M-\cX */
1830 if (meta_prefix) {
1831 strcpy(err, "duplicate meta escape");
1832 return -1;
1834 meta_prefix = 1;
1835 if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
1836 if (*p == '\\') {
1837 p++;
1838 goto again;
1840 else {
1841 code = *p++;
1842 break;
1845 strcpy(err, "too short meta escape");
1846 return -1;
1848 case 'C': /* \C-X, \C-\M-X */
1849 if (p == end || *p++ != '-') {
1850 strcpy(err, "too short control escape");
1851 return -1;
1853 case 'c': /* \cX, \c\M-X */
1854 if (ctrl_prefix) {
1855 strcpy(err, "duplicate control escape");
1856 return -1;
1858 ctrl_prefix = 1;
1859 if (p < end && (*p & 0x80) == 0) {
1860 if (*p == '\\') {
1861 p++;
1862 goto again;
1864 else {
1865 code = *p++;
1866 break;
1869 strcpy(err, "too short control escape");
1870 return -1;
1872 default:
1873 strcpy(err, "unexpected escape sequence");
1874 return -1;
1876 if (code < 0 || 0xff < code) {
1877 strcpy(err, "invalid escape code");
1878 return -1;
1881 if (ctrl_prefix)
1882 code &= 0x1f;
1883 if (meta_prefix)
1884 code |= 0x80;
1886 *pp = p;
1887 return code;
1890 static int
1891 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
1892 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
1894 const char *p = *pp;
1895 int chmaxlen = rb_enc_mbmaxlen(enc);
1896 char *chbuf = ALLOCA_N(char, chmaxlen);
1897 int chlen = 0;
1898 int byte;
1899 int l;
1901 memset(chbuf, 0, chmaxlen);
1903 byte = read_escaped_byte(&p, end, err);
1904 if (byte == -1) {
1905 return -1;
1908 chbuf[chlen++] = byte;
1909 while (chlen < chmaxlen &&
1910 MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
1911 byte = read_escaped_byte(&p, end, err);
1912 if (byte == -1) {
1913 return -1;
1915 chbuf[chlen++] = byte;
1918 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
1919 if (MBCLEN_INVALID_P(l)) {
1920 strcpy(err, "invalid multibyte escape");
1921 return -1;
1923 if (1 < chlen || (chbuf[0] & 0x80)) {
1924 rb_str_buf_cat(buf, chbuf, chlen);
1926 if (*encp == 0)
1927 *encp = enc;
1928 else if (*encp != enc) {
1929 strcpy(err, "escaped non ASCII character in UTF-8 regexp");
1930 return -1;
1933 else {
1934 char escbuf[5];
1935 snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
1936 rb_str_buf_cat(buf, escbuf, 4);
1938 *pp = p;
1939 return 0;
1942 static int
1943 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
1945 if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */
1946 0x10ffff < code) {
1947 strcpy(err, "invalid Unicode range");
1948 return -1;
1950 return 0;
1953 static int
1954 append_utf8(unsigned long uv,
1955 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
1957 if (check_unicode_range(uv, err) != 0)
1958 return -1;
1959 if (uv < 0x80) {
1960 char escbuf[5];
1961 snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
1962 rb_str_buf_cat(buf, escbuf, 4);
1964 else {
1965 int len;
1966 char utf8buf[6];
1967 len = rb_uv_to_utf8(utf8buf, uv);
1968 rb_str_buf_cat(buf, utf8buf, len);
1970 if (*encp == 0)
1971 *encp = rb_utf8_encoding();
1972 else if (*encp != rb_utf8_encoding()) {
1973 strcpy(err, "UTF-8 character in non UTF-8 regexp");
1974 return -1;
1977 return 0;
1980 static int
1981 unescape_unicode_list(const char **pp, const char *end,
1982 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
1984 const char *p = *pp;
1985 int has_unicode = 0;
1986 unsigned long code;
1987 int len;
1989 while (p < end && ISSPACE(*p)) p++;
1991 while (1) {
1992 code = ruby_scan_hex(p, end-p, &len);
1993 if (len == 0)
1994 break;
1995 if (6 < len) { /* max 10FFFF */
1996 strcpy(err, "invalid Unicode range");
1997 return -1;
1999 p += len;
2000 if (append_utf8(code, buf, encp, err) != 0)
2001 return -1;
2002 has_unicode = 1;
2004 while (p < end && ISSPACE(*p)) p++;
2007 if (has_unicode == 0) {
2008 strcpy(err, "invalid Unicode list");
2009 return -1;
2012 *pp = p;
2014 return 0;
2017 static int
2018 unescape_unicode_bmp(const char **pp, const char *end,
2019 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
2021 const char *p = *pp;
2022 int len;
2023 unsigned long code;
2025 if (end < p+4) {
2026 strcpy(err, "invalid Unicode escape");
2027 return -1;
2029 code = ruby_scan_hex(p, 4, &len);
2030 if (len != 4) {
2031 strcpy(err, "invalid Unicode escape");
2032 return -1;
2034 if (append_utf8(code, buf, encp, err) != 0)
2035 return -1;
2036 *pp = p + 4;
2037 return 0;
2040 static int
2041 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
2042 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
2044 char c;
2045 char smallbuf[2];
2047 while (p < end) {
2048 int chlen = rb_enc_precise_mbclen(p, end, enc);
2049 if (!MBCLEN_CHARFOUND_P(chlen)) {
2050 strcpy(err, "invalid multibyte character");
2051 return -1;
2053 chlen = MBCLEN_CHARFOUND_LEN(chlen);
2054 if (1 < chlen || (*p & 0x80)) {
2055 rb_str_buf_cat(buf, p, chlen);
2056 p += chlen;
2057 if (*encp == 0)
2058 *encp = enc;
2059 else if (*encp != enc) {
2060 strcpy(err, "non ASCII character in UTF-8 regexp");
2061 return -1;
2063 continue;
2066 switch (c = *p++) {
2067 case '\\':
2068 if (p == end) {
2069 strcpy(err, "too short escape sequence");
2070 return -1;
2072 switch (c = *p++) {
2073 case '1': case '2': case '3':
2074 case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
2076 int octlen;
2077 if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
2078 /* backref or 7bit octal.
2079 no need to unescape anyway.
2080 re-escaping may break backref */
2081 goto escape_asis;
2084 /* xxx: How about more than 199 subexpressions? */
2086 case '0': /* \0, \0O, \0OO */
2088 case 'x': /* \xHH */
2089 case 'c': /* \cX, \c\M-X */
2090 case 'C': /* \C-X, \C-\M-X */
2091 case 'M': /* \M-X, \M-\C-X, \M-\cX */
2092 p = p-2;
2093 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
2094 return -1;
2095 break;
2097 case 'u':
2098 if (p == end) {
2099 strcpy(err, "too short escape sequence");
2100 return -1;
2102 if (*p == '{') {
2103 /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
2104 p++;
2105 if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
2106 return -1;
2107 if (p == end || *p++ != '}') {
2108 strcpy(err, "invalid Unicode list");
2109 return -1;
2111 break;
2113 else {
2114 /* \uHHHH */
2115 if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
2116 return -1;
2117 break;
2120 default: /* \n, \\, \d, \9, etc. */
2121 escape_asis:
2122 smallbuf[0] = '\\';
2123 smallbuf[1] = c;
2124 rb_str_buf_cat(buf, smallbuf, 2);
2125 break;
2127 break;
2129 default:
2130 rb_str_buf_cat(buf, &c, 1);
2131 break;
2135 return 0;
2138 static VALUE
2139 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
2140 rb_encoding **fixed_enc, onig_errmsg_buffer err)
2142 VALUE buf;
2144 buf = rb_str_buf_new(0);
2146 if (rb_enc_asciicompat(enc))
2147 *fixed_enc = 0;
2148 else {
2149 *fixed_enc = enc;
2150 rb_enc_associate(buf, enc);
2153 if (unescape_nonascii(p, end, enc, buf, fixed_enc, err) != 0)
2154 return Qnil;
2156 if (*fixed_enc) {
2157 rb_enc_associate(buf, *fixed_enc);
2160 return buf;
2163 VALUE
2164 rb_reg_check_preprocess(VALUE str)
2166 rb_encoding *fixed_enc = 0;
2167 onig_errmsg_buffer err = "";
2168 VALUE buf;
2169 char *p, *end;
2170 rb_encoding *enc;
2172 StringValue(str);
2173 p = RSTRING_PTR(str);
2174 end = p + RSTRING_LEN(str);
2175 enc = rb_enc_get(str);
2177 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
2178 RB_GC_GUARD(str);
2180 if (buf == Qnil) {
2181 return rb_reg_error_desc(str, 0, err);
2183 return Qnil;
2186 static VALUE
2187 rb_reg_preprocess_dregexp(VALUE ary)
2189 rb_encoding *fixed_enc = 0;
2190 rb_encoding *regexp_enc = 0;
2191 onig_errmsg_buffer err = "";
2192 int i;
2193 VALUE result = 0;
2194 int argc = RARRAY_LEN(ary);
2195 VALUE *argv = RARRAY_PTR(ary);
2197 if (argc == 0) {
2198 rb_raise(rb_eArgError, "no arguments given");
2201 for (i = 0; i < argc; i++) {
2202 VALUE str = argv[i];
2203 VALUE buf;
2204 char *p, *end;
2205 rb_encoding *src_enc;
2207 StringValue(str);
2208 p = RSTRING_PTR(str);
2209 end = p + RSTRING_LEN(str);
2210 src_enc = rb_enc_get(str);
2212 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
2214 if (buf == Qnil)
2215 rb_raise(rb_eArgError, "%s", err);
2217 if (fixed_enc != 0) {
2218 if (regexp_enc != 0 && regexp_enc != fixed_enc) {
2219 rb_raise(rb_eArgError, "encoding mismatch in dynamic regexp : %s and %s",
2220 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
2222 regexp_enc = fixed_enc;
2225 if (!result)
2226 result = rb_str_new3(str);
2227 else
2228 rb_str_buf_append(result, str);
2230 if (regexp_enc) {
2231 rb_enc_associate(result, regexp_enc);
2234 return result;
2237 static int
2238 rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
2239 int options, onig_errmsg_buffer err)
2241 struct RRegexp *re = RREGEXP(obj);
2242 VALUE unescaped;
2243 rb_encoding *fixed_enc = 0;
2244 rb_encoding *a_enc = rb_ascii8bit_encoding();
2246 if (!OBJ_TAINTED(obj) && rb_safe_level() >= 4)
2247 rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
2248 rb_check_frozen(obj);
2249 if (FL_TEST(obj, REG_LITERAL))
2250 rb_raise(rb_eSecurityError, "can't modify literal regexp");
2251 if (re->ptr) onig_free(re->ptr);
2252 if (re->str) free(re->str);
2253 re->ptr = 0;
2254 re->str = 0;
2256 unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
2257 if (unescaped == Qnil)
2258 return -1;
2260 if (fixed_enc) {
2261 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
2262 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
2263 strcpy(err, "incompatible character encoding");
2264 return -1;
2266 if (fixed_enc != a_enc) {
2267 options |= ARG_ENCODING_FIXED;
2268 enc = fixed_enc;
2271 else if (!(options & ARG_ENCODING_FIXED)) {
2272 enc = rb_usascii_encoding();
2275 rb_enc_associate((VALUE)re, enc);
2276 if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
2277 re->basic.flags |= KCODE_FIXED;
2279 if (options & ARG_ENCODING_NONE) {
2280 re->basic.flags |= REG_ENCODING_NONE;
2283 re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
2284 options & ARG_REG_OPTION_MASK, err);
2285 if (!re->ptr) return -1;
2286 re->str = ALLOC_N(char, len+1);
2287 memcpy(re->str, s, len);
2288 re->str[len] = '\0';
2289 re->len = len;
2290 RB_GC_GUARD(unescaped);
2291 return 0;
2294 static int
2295 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err)
2297 int ret;
2298 rb_encoding *enc = rb_enc_get(str);
2299 if (options & ARG_ENCODING_NONE) {
2300 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
2301 if (enc != ascii8bit) {
2302 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
2303 strcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2304 return -1;
2306 enc = ascii8bit;
2309 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
2310 options, err);
2311 RB_GC_GUARD(str);
2312 return ret;
2315 static VALUE
2316 rb_reg_s_alloc(VALUE klass)
2318 NEWOBJ(re, struct RRegexp);
2319 OBJSETUP(re, klass, T_REGEXP);
2321 re->ptr = 0;
2322 re->len = 0;
2323 re->str = 0;
2325 return (VALUE)re;
2328 VALUE
2329 rb_reg_new_str(VALUE s, int options)
2331 VALUE re = rb_reg_s_alloc(rb_cRegexp);
2332 onig_errmsg_buffer err = "";
2334 if (rb_reg_initialize_str(re, s, options, err) != 0) {
2335 rb_reg_raise_str(s, options, err);
2338 return re;
2341 VALUE
2342 rb_reg_new_ary(VALUE ary, int opt)
2344 return rb_reg_new_str(rb_reg_preprocess_dregexp(ary), opt);
2347 VALUE
2348 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
2350 VALUE re = rb_reg_s_alloc(rb_cRegexp);
2351 onig_errmsg_buffer err = "";
2353 if (rb_reg_initialize(re, s, len, enc, options, err) != 0) {
2354 rb_enc_reg_raise(s, len, enc, options, err);
2357 return re;
2360 VALUE
2361 rb_reg_new(const char *s, long len, int options)
2363 return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
2366 VALUE
2367 rb_reg_compile(VALUE str, int options)
2369 VALUE re = rb_reg_s_alloc(rb_cRegexp);
2370 onig_errmsg_buffer err = "";
2372 if (!str) str = rb_str_new(0,0);
2373 if (rb_reg_initialize_str(re, str, options, err) != 0) {
2374 rb_set_errinfo(rb_reg_error_desc(str, options, err));
2375 return Qnil;
2377 FL_SET(re, REG_LITERAL);
2378 return re;
2381 static VALUE reg_cache;
2383 VALUE
2384 rb_reg_regcomp(VALUE str)
2386 volatile VALUE save_str = str;
2387 if (reg_cache && RREGEXP(reg_cache)->len == RSTRING_LEN(str)
2388 && ENCODING_GET(reg_cache) == ENCODING_GET(str)
2389 && memcmp(RREGEXP(reg_cache)->str, RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
2390 return reg_cache;
2392 return reg_cache = rb_reg_new_str(save_str, 0);
2396 * call-seq:
2397 * rxp.hash => fixnum
2399 * Produce a hash based on the text and options of this regular expression.
2402 static VALUE
2403 rb_reg_hash(VALUE re)
2405 int hashval, len;
2406 char *p;
2408 rb_reg_check(re);
2409 hashval = RREGEXP(re)->ptr->options;
2410 len = RREGEXP(re)->len;
2411 p = RREGEXP(re)->str;
2412 while (len--) {
2413 hashval = hashval * 33 + *p++;
2415 hashval = hashval + (hashval>>5);
2417 return INT2FIX(hashval);
2422 * call-seq:
2423 * rxp == other_rxp => true or false
2424 * rxp.eql?(other_rxp) => true or false
2426 * Equality---Two regexps are equal if their patterns are identical, they have
2427 * the same character set code, and their <code>casefold?</code> values are the
2428 * same.
2430 * /abc/ == /abc/x #=> false
2431 * /abc/ == /abc/i #=> false
2432 * /abc/ == /abc/n #=> false
2433 * /abc/u == /abc/n #=> false
2436 static VALUE
2437 rb_reg_equal(VALUE re1, VALUE re2)
2439 if (re1 == re2) return Qtrue;
2440 if (TYPE(re2) != T_REGEXP) return Qfalse;
2441 rb_reg_check(re1); rb_reg_check(re2);
2442 if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
2443 if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
2444 if (RREGEXP(re1)->len != RREGEXP(re2)->len) return Qfalse;
2445 if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
2446 if (memcmp(RREGEXP(re1)->str, RREGEXP(re2)->str, RREGEXP(re1)->len) == 0) {
2447 return Qtrue;
2449 return Qfalse;
2452 static VALUE
2453 reg_operand(VALUE s, int check)
2455 if (SYMBOL_P(s)) {
2456 return rb_sym_to_s(s);
2458 else {
2459 VALUE tmp = rb_check_string_type(s);
2460 if (check && NIL_P(tmp)) {
2461 rb_raise(rb_eTypeError, "can't convert %s to String",
2462 rb_obj_classname(s));
2464 return tmp;
2468 static long
2469 reg_match_pos(VALUE re, VALUE *strp, long pos)
2471 VALUE str = *strp;
2473 if (NIL_P(str)) {
2474 rb_backref_set(Qnil);
2475 return -1;
2477 *strp = str = reg_operand(str, Qtrue);
2478 if (pos != 0) {
2479 if (pos < 0) {
2480 VALUE l = rb_str_length(str);
2481 pos += NUM2INT(l);
2482 if (pos < 0) {
2483 return pos;
2486 pos = rb_reg_adjust_startpos(re, str, pos, 0);
2488 return rb_reg_search(re, str, pos, 0);
2492 * call-seq:
2493 * rxp =~ str => integer or nil
2495 * Match---Matches <i>rxp</i> against <i>str</i>.
2497 * /at/ =~ "input data" #=> 7
2498 * /ax/ =~ "input data" #=> nil
2500 * If <code>=~</code> is used with a regexp literal with named captures,
2501 * captured strings (or nil) are assigned to local variables named by
2502 * the capture names.
2504 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = y "
2505 * p lhs #=> "x"
2506 * p rhs #=> "y"
2508 * If it is not matched, nil is assigned for the variables.
2510 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = "
2511 * p lhs #=> nil
2512 * p rhs #=> nil
2514 * This assignment is implemented in the Ruby parser.
2515 * So a regexp literal is required for the assignment.
2516 * The assignment does not occur if the regexp is not a literal.
2518 * re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
2519 * re =~ " x = "
2520 * p lhs # undefined local variable
2521 * p rhs # undefined local variable
2523 * A regexp interpolation, <code>#{}</code>, also disables
2524 * the assignment.
2526 * rhs_pat = /(?<rhs>\w+)/
2527 * /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
2528 * p lhs # undefined local variable
2532 VALUE
2533 rb_reg_match(VALUE re, VALUE str)
2535 long pos = reg_match_pos(re, &str, 0);
2536 if (pos < 0) return Qnil;
2537 pos = rb_str_sublen(str, pos);
2538 return LONG2FIX(pos);
2542 * call-seq:
2543 * rxp === str => true or false
2545 * Case Equality---Synonym for <code>Regexp#=~</code> used in case statements.
2547 * a = "HELLO"
2548 * case a
2549 * when /^[a-z]*$/; print "Lower case\n"
2550 * when /^[A-Z]*$/; print "Upper case\n"
2551 * else; print "Mixed case\n"
2552 * end
2554 * <em>produces:</em>
2556 * Upper case
2559 VALUE
2560 rb_reg_eqq(VALUE re, VALUE str)
2562 long start;
2564 str = reg_operand(str, Qfalse);
2565 if (NIL_P(str)) {
2566 rb_backref_set(Qnil);
2567 return Qfalse;
2569 start = rb_reg_search(re, str, 0, 0);
2570 if (start < 0) {
2571 return Qfalse;
2573 return Qtrue;
2578 * call-seq:
2579 * ~ rxp => integer or nil
2581 * Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
2582 * Equivalent to <code><i>rxp</i> =~ $_</code>.
2584 * $_ = "input data"
2585 * ~ /at/ #=> 7
2588 VALUE
2589 rb_reg_match2(VALUE re)
2591 long start;
2592 VALUE line = rb_lastline_get();
2594 if (TYPE(line) != T_STRING) {
2595 rb_backref_set(Qnil);
2596 return Qnil;
2599 start = rb_reg_search(re, line, 0, 0);
2600 if (start < 0) {
2601 return Qnil;
2603 start = rb_str_sublen(line, start);
2604 return LONG2FIX(start);
2609 * call-seq:
2610 * rxp.match(str) => matchdata or nil
2611 * rxp.match(str,pos) => matchdata or nil
2613 * Returns a <code>MatchData</code> object describing the match, or
2614 * <code>nil</code> if there was no match. This is equivalent to retrieving the
2615 * value of the special variable <code>$~</code> following a normal match.
2616 * If the second parameter is present, it specifies the position in the string
2617 * to begin the search.
2619 * /(.)(.)(.)/.match("abc")[2] #=> "b"
2620 * /(.)(.)/.match("abc", 1)[2] #=> "c"
2622 * If a block is given, invokes the block with MatchData if the match succeeds, so
2623 * that you can write
2625 * pat.match(str) {|m| ...}
2627 * instead of
2629 * if m = pat.match(str)
2630 * ...
2631 * end
2633 * The return value is a value from block execution in this case.
2636 static VALUE
2637 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
2639 VALUE result, str, initpos;
2640 long pos;
2642 if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
2643 pos = NUM2LONG(initpos);
2645 else {
2646 pos = 0;
2649 pos = reg_match_pos(re, &str, pos);
2650 if (pos < 0) {
2651 rb_backref_set(Qnil);
2652 return Qnil;
2654 result = rb_backref_get();
2655 rb_match_busy(result);
2656 if (!NIL_P(result) && rb_block_given_p()) {
2657 return rb_yield(result);
2659 return result;
2663 * Document-method: compile
2665 * Synonym for <code>Regexp.new</code>
2670 * call-seq:
2671 * Regexp.new(string [, options]) => regexp
2672 * Regexp.new(regexp) => regexp
2673 * Regexp.compile(string [, options]) => regexp
2674 * Regexp.compile(regexp) => regexp
2676 * Constructs a new regular expression from <i>pattern</i>, which can be either
2677 * a <code>String</code> or a <code>Regexp</code> (in which case that regexp's
2678 * options are propagated, and new options may not be specified; a change as of
2679 * Ruby 1.8). If <i>options</i> is a <code>Fixnum</code>, it should be one or
2680 * more of the constants <code>Regexp::EXTENDED</code>,
2681 * <code>Regexp::IGNORECASE</code>, and <code>Regexp::MULTILINE</code>,
2682 * <em>or</em>-ed together. Otherwise, if <i>options</i> is not
2683 * <code>nil</code>, the regexp will be case insensitive.
2685 * r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/
2686 * r2 = Regexp.new('cat', true) #=> /cat/i
2687 * r3 = Regexp.new('dog', Regexp::EXTENDED) #=> /dog/x
2688 * r4 = Regexp.new(r2) #=> /cat/i
2691 static VALUE
2692 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
2694 onig_errmsg_buffer err = "";
2695 int flags = 0;
2696 VALUE str;
2697 rb_encoding *enc;
2698 const char *ptr;
2699 long len;
2701 if (argc == 0 || argc > 3) {
2702 rb_raise(rb_eArgError, "wrong number of arguments");
2704 if (TYPE(argv[0]) == T_REGEXP) {
2705 VALUE re = argv[0];
2707 if (argc > 1) {
2708 rb_warn("flags ignored");
2710 rb_reg_check(re);
2711 flags = rb_reg_options(re);
2712 ptr = RREGEXP(re)->str;
2713 len = RREGEXP(re)->len;
2714 enc = rb_enc_get(re);
2715 if (rb_reg_initialize(self, ptr, len, enc, flags, err)) {
2716 str = rb_enc_str_new(ptr, len, enc);
2717 rb_reg_raise_str(str, flags, err);
2720 else {
2721 if (argc >= 2) {
2722 if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
2723 else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
2725 enc = 0;
2726 if (argc == 3 && !NIL_P(argv[2])) {
2727 char *kcode = StringValuePtr(argv[2]);
2728 if (kcode[0] == 'n' || kcode[1] == 'N') {
2729 enc = rb_ascii8bit_encoding();
2730 flags |= ARG_ENCODING_FIXED;
2732 else {
2733 rb_warning("encoding option is obsolete - %s", kcode);
2736 str = argv[0];
2737 ptr = StringValuePtr(str);
2738 if (enc
2739 ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err)
2740 : rb_reg_initialize_str(self, str, flags, err)) {
2741 rb_reg_raise_str(str, flags, err);
2744 return self;
2747 VALUE
2748 rb_reg_quote(VALUE str)
2750 rb_encoding *enc = rb_enc_get(str);
2751 char *s, *send, *t;
2752 VALUE tmp;
2753 int c, clen;
2754 int ascii_only = rb_enc_str_asciionly_p(str);
2756 s = RSTRING_PTR(str);
2757 send = s + RSTRING_LEN(str);
2758 while (s < send) {
2759 c = rb_enc_ascget(s, send, &clen, enc);
2760 if (c == -1) {
2761 s += mbclen(s, send, enc);
2762 continue;
2764 switch (c) {
2765 case '[': case ']': case '{': case '}':
2766 case '(': case ')': case '|': case '-':
2767 case '*': case '.': case '\\':
2768 case '?': case '+': case '^': case '$':
2769 case ' ': case '#':
2770 case '\t': case '\f': case '\v': case '\n': case '\r':
2771 goto meta_found;
2773 s += clen;
2775 tmp = rb_str_new3(str);
2776 if (ascii_only) {
2777 rb_enc_associate(tmp, rb_usascii_encoding());
2779 return tmp;
2781 meta_found:
2782 tmp = rb_str_new(0, RSTRING_LEN(str)*2);
2783 if (ascii_only) {
2784 rb_enc_associate(tmp, rb_usascii_encoding());
2786 else {
2787 rb_enc_copy(tmp, str);
2789 t = RSTRING_PTR(tmp);
2790 /* copy upto metacharacter */
2791 memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
2792 t += s - RSTRING_PTR(str);
2794 while (s < send) {
2795 c = rb_enc_ascget(s, send, &clen, enc);
2796 if (c == -1) {
2797 int n = mbclen(s, send, enc);
2799 while (n--)
2800 *t++ = *s++;
2801 continue;
2803 s += clen;
2804 switch (c) {
2805 case '[': case ']': case '{': case '}':
2806 case '(': case ')': case '|': case '-':
2807 case '*': case '.': case '\\':
2808 case '?': case '+': case '^': case '$':
2809 case '#':
2810 *t++ = '\\';
2811 break;
2812 case ' ':
2813 *t++ = '\\';
2814 *t++ = ' ';
2815 continue;
2816 case '\t':
2817 *t++ = '\\';
2818 *t++ = 't';
2819 continue;
2820 case '\n':
2821 *t++ = '\\';
2822 *t++ = 'n';
2823 continue;
2824 case '\r':
2825 *t++ = '\\';
2826 *t++ = 'r';
2827 continue;
2828 case '\f':
2829 *t++ = '\\';
2830 *t++ = 'f';
2831 continue;
2832 case '\v':
2833 *t++ = '\\';
2834 *t++ = 'v';
2835 continue;
2837 *t++ = c;
2839 rb_str_resize(tmp, t - RSTRING_PTR(tmp));
2840 OBJ_INFECT(tmp, str);
2841 return tmp;
2846 * call-seq:
2847 * Regexp.escape(str) => string
2848 * Regexp.quote(str) => string
2850 * Escapes any characters that would have special meaning in a regular
2851 * expression. Returns a new escaped string, or self if no characters are
2852 * escaped. For any string,
2853 * <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
2855 * Regexp.escape('\*?{}.') #=> \\\*\?\{\}\.
2859 static VALUE
2860 rb_reg_s_quote(VALUE c, VALUE str)
2862 return rb_reg_quote(reg_operand(str, Qtrue));
2866 rb_reg_options(VALUE re)
2868 int options;
2870 rb_reg_check(re);
2871 options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
2872 if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
2873 if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
2874 return options;
2877 VALUE
2878 rb_check_regexp_type(VALUE re)
2880 return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
2884 * call-seq:
2885 * Regexp.try_convert(obj) -> re or nil
2887 * Try to convert <i>obj</i> into a Regexp, using to_regexp method.
2888 * Returns converted regexp or nil if <i>obj</i> cannot be converted
2889 * for any reason.
2891 * Regexp.try_convert(/re/) #=> /re/
2892 * Regexp.try_convert("re") #=> nil
2894 * o = Object.new
2895 * Regexp.try_convert(o) #=> nil
2896 * def o.to_regexp() /foo/ end
2897 * Regexp.try_convert(o) #=> /foo/
2900 static VALUE
2901 rb_reg_s_try_convert(VALUE dummy, VALUE re)
2903 return rb_check_regexp_type(re);
2906 static VALUE
2907 rb_reg_s_union(VALUE self, VALUE args0)
2909 long argc = RARRAY_LEN(args0);
2911 if (argc == 0) {
2912 VALUE args[1];
2913 args[0] = rb_str_new2("(?!)");
2914 return rb_class_new_instance(1, args, rb_cRegexp);
2916 else if (argc == 1) {
2917 VALUE arg = rb_ary_entry(args0, 0);
2918 VALUE re = rb_check_regexp_type(arg);
2919 if (!NIL_P(re))
2920 return re;
2921 else {
2922 VALUE quoted;
2923 quoted = rb_reg_s_quote(Qnil, arg);
2924 return rb_reg_new_str(quoted, 0);
2927 else {
2928 int i;
2929 VALUE source = rb_str_buf_new(0);
2930 rb_encoding *result_enc;
2932 int has_asciionly = 0;
2933 rb_encoding *has_ascii_compat_fixed = 0;
2934 rb_encoding *has_ascii_incompat = 0;
2936 for (i = 0; i < argc; i++) {
2937 volatile VALUE v;
2938 VALUE e = rb_ary_entry(args0, i);
2940 if (0 < i)
2941 rb_str_buf_cat_ascii(source, "|");
2943 v = rb_check_regexp_type(e);
2944 if (!NIL_P(v)) {
2945 rb_encoding *enc = rb_enc_get(v);
2946 if (!rb_enc_asciicompat(enc)) {
2947 if (!has_ascii_incompat)
2948 has_ascii_incompat = enc;
2949 else if (has_ascii_incompat != enc)
2950 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2951 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
2953 else if (rb_reg_fixed_encoding_p(v)) {
2954 if (!has_ascii_compat_fixed)
2955 has_ascii_compat_fixed = enc;
2956 else if (has_ascii_compat_fixed != enc)
2957 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2958 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
2960 else {
2961 has_asciionly = 1;
2963 v = rb_reg_to_s(v);
2965 else {
2966 rb_encoding *enc = rb_enc_get(e);
2967 StringValue(e);
2968 enc = rb_enc_get(e);
2969 if (!rb_enc_str_asciicompat_p(e)) {
2970 if (!has_ascii_incompat)
2971 has_ascii_incompat = enc;
2972 else if (has_ascii_incompat != enc)
2973 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2974 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
2976 else if (rb_enc_str_asciionly_p(e)) {
2977 has_asciionly = 1;
2979 else {
2980 if (!has_ascii_compat_fixed)
2981 has_ascii_compat_fixed = enc;
2982 else if (has_ascii_compat_fixed != enc)
2983 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2984 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
2986 v = rb_reg_s_quote(Qnil, e);
2988 if (has_ascii_incompat) {
2989 if (has_asciionly) {
2990 rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
2991 rb_enc_name(has_ascii_incompat));
2993 if (has_ascii_compat_fixed) {
2994 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2995 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
2999 if (i == 0) {
3000 rb_enc_copy(source, v);
3002 rb_str_append(source, v);
3005 if (has_ascii_incompat) {
3006 result_enc = has_ascii_incompat;
3008 else if (has_ascii_compat_fixed) {
3009 result_enc = has_ascii_compat_fixed;
3011 else {
3012 result_enc = rb_ascii8bit_encoding();
3015 rb_enc_associate(source, result_enc);
3016 return rb_class_new_instance(1, &source, rb_cRegexp);
3021 * call-seq:
3022 * Regexp.union(pat1, pat2, ...) => new_regexp
3023 * Regexp.union(pats_ary) => new_regexp
3025 * Return a <code>Regexp</code> object that is the union of the given
3026 * <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s
3027 * can be Regexp objects, in which case their options will be preserved, or
3028 * Strings. If no patterns are given, returns <code>/(?!)/</code>.
3030 * Regexp.union #=> /(?!)/
3031 * Regexp.union("penzance") #=> /penzance/
3032 * Regexp.union("a+b*c") #=> /a\+b\*c/
3033 * Regexp.union("skiing", "sledding") #=> /skiing|sledding/
3034 * Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
3035 * Regexp.union(/dogs/, /cats/i) #=> /(?-mix:dogs)|(?i-mx:cats)/
3037 static VALUE
3038 rb_reg_s_union_m(VALUE self, VALUE args)
3040 VALUE v;
3041 if (RARRAY_LEN(args) == 1 &&
3042 !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
3043 return rb_reg_s_union(self, v);
3045 return rb_reg_s_union(self, args);
3048 /* :nodoc: */
3049 static VALUE
3050 rb_reg_init_copy(VALUE copy, VALUE re)
3052 onig_errmsg_buffer err = "";
3053 const char *s;
3054 long len;
3056 if (copy == re) return copy;
3057 rb_check_frozen(copy);
3058 /* need better argument type check */
3059 if (!rb_obj_is_instance_of(re, rb_obj_class(copy))) {
3060 rb_raise(rb_eTypeError, "wrong argument type");
3062 rb_reg_check(re);
3063 s = RREGEXP(re)->str;
3064 len = RREGEXP(re)->len;
3065 if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re), err) != 0) {
3066 rb_reg_raise(s, len, err, re);
3068 return copy;
3071 VALUE
3072 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
3074 VALUE val = 0;
3075 char *p, *s, *e;
3076 int no, clen;
3077 rb_encoding *str_enc = rb_enc_get(str);
3078 rb_encoding *src_enc = rb_enc_get(src);
3080 p = s = RSTRING_PTR(str);
3081 e = s + RSTRING_LEN(str);
3083 while (s < e) {
3084 int c = rb_enc_ascget(s, e, &clen, str_enc);
3085 char *ss;
3087 if (c == -1) {
3088 s += mbclen(s, e, str_enc);
3089 continue;
3091 ss = s;
3092 s += clen;
3094 if (c != '\\' || s == e) continue;
3096 if (!val) {
3097 val = rb_str_buf_new(ss-p);
3099 rb_enc_str_buf_cat(val, p, ss-p, str_enc);
3101 c = rb_enc_ascget(s, e, &clen, str_enc);
3102 if (c == -1) {
3103 s += mbclen(s, e, str_enc);
3104 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3105 p = s;
3106 continue;
3108 s += clen;
3110 p = s;
3111 switch (c) {
3112 case '1': case '2': case '3': case '4':
3113 case '5': case '6': case '7': case '8': case '9':
3114 if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
3115 no = c - '0';
3117 else {
3118 continue;
3120 break;
3122 case 'k':
3123 if (s < e && rb_enc_ascget(s, e, &clen, str_enc) == '<') {
3124 char *name, *name_end;
3126 name_end = name = s + clen;
3127 while (name_end < e) {
3128 c = rb_enc_ascget(name_end, e, &clen, str_enc);
3129 if (c == '>') break;
3130 name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
3132 if (name_end < e) {
3133 no = name_to_backref_number(regs, regexp, name, name_end);
3134 p = s = name_end + clen;
3135 break;
3137 else {
3138 rb_raise(rb_eRuntimeError, "invalid group name reference format");
3142 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3143 continue;
3145 case '0':
3146 case '&':
3147 no = 0;
3148 break;
3150 case '`':
3151 rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
3152 continue;
3154 case '\'':
3155 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
3156 continue;
3158 case '+':
3159 no = regs->num_regs-1;
3160 while (BEG(no) == -1 && no > 0) no--;
3161 if (no == 0) continue;
3162 break;
3164 case '\\':
3165 rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
3166 continue;
3168 default:
3169 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3170 continue;
3173 if (no >= 0) {
3174 if (no >= regs->num_regs) continue;
3175 if (BEG(no) == -1) continue;
3176 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
3180 if (!val) return str;
3181 if (p < e) {
3182 rb_enc_str_buf_cat(val, p, e-p, str_enc);
3185 return val;
3188 static VALUE
3189 kcode_getter(void)
3191 rb_warn("variable $KCODE is no longer effective");
3192 return Qnil;
3195 static void
3196 kcode_setter(VALUE val, ID id)
3198 rb_warn("variable $KCODE is no longer effective; ignored");
3201 static VALUE
3202 ignorecase_getter(void)
3204 rb_warn("variable $= is no longer effective");
3205 return Qfalse;
3208 static void
3209 ignorecase_setter(VALUE val, ID id)
3211 rb_warn("variable $= is no longer effective; ignored");
3214 static VALUE
3215 match_getter(void)
3217 VALUE match = rb_backref_get();
3219 if (NIL_P(match)) return Qnil;
3220 rb_match_busy(match);
3221 return match;
3224 static void
3225 match_setter(VALUE val)
3227 if (!NIL_P(val)) {
3228 Check_Type(val, T_MATCH);
3230 rb_backref_set(val);
3234 * call-seq:
3235 * Regexp.last_match => matchdata
3236 * Regexp.last_match(n) => str
3238 * The first form returns the <code>MatchData</code> object generated by the
3239 * last successful pattern match. Equivalent to reading the global variable
3240 * <code>$~</code>. The second form returns the <i>n</i>th field in this
3241 * <code>MatchData</code> object.
3242 * <em>n</em> can be a string or symbol to reference a named capture.
3244 * /c(.)t/ =~ 'cat' #=> 0
3245 * Regexp.last_match #=> #<MatchData "cat" 1:"a">
3246 * Regexp.last_match(0) #=> "cat"
3247 * Regexp.last_match(1) #=> "a"
3248 * Regexp.last_match(2) #=> nil
3250 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
3251 * Regexp.last_match #=> #<MatchData "var = val" lhs:"var" rhs:"val">
3252 * Regexp.last_match(:lhs) #=> "var"
3253 * Regexp.last_match(:rhs) #=> "val"
3256 static VALUE
3257 rb_reg_s_last_match(int argc, VALUE *argv)
3259 VALUE nth;
3261 if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
3262 VALUE match = rb_backref_get();
3263 int n;
3264 if (NIL_P(match)) return Qnil;
3265 n = match_backref_number(match, nth);
3266 return rb_reg_nth_match(n, match);
3268 return match_getter();
/*
 * Forwards the message s to rb_warn().  The text is passed as a "%s"
 * argument, never as the format string itself.
 */
static void
re_warn(const char *s)
{
    const char *message = s;

    rb_warn("%s", message);
}
3278 * Document-class: Regexp
3280 * A <code>Regexp</code> holds a regular expression, used to match a pattern
3281 * against strings. Regexps are created using the <code>/.../</code> and
3282 * <code>%r{...}</code> literals, and by the <code>Regexp::new</code>
3283 * constructor.
3287 void
3288 Init_Regexp(void)
3290 rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
3292 onigenc_set_default_caseconv_table((UChar*)casetable);
3293 onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
3294 onig_set_warn_func(re_warn);
3295 onig_set_verb_warn_func(re_warn);
3297 rb_define_virtual_variable("$~", match_getter, match_setter);
3298 rb_define_virtual_variable("$&", last_match_getter, 0);
3299 rb_define_virtual_variable("$`", prematch_getter, 0);
3300 rb_define_virtual_variable("$'", postmatch_getter, 0);
3301 rb_define_virtual_variable("$+", last_paren_match_getter, 0);
3303 rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
3304 rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
3305 rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
3307 rb_cRegexp = rb_define_class("Regexp", rb_cObject);
3308 rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
3309 rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
3310 rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
3311 rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
3312 rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
3313 rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
3314 rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
3316 rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
3317 rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
3318 rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
3319 rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
3320 rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
3321 rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
3322 rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
3323 rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
3324 rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
3325 rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
3326 rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
3327 rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
3328 rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
3329 rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
3330 rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */
3331 rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
3332 rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
3333 rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
3335 rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
3336 rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
3337 rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
3339 rb_global_variable(&reg_cache);
3341 rb_cMatch = rb_define_class("MatchData", rb_cObject);
3342 rb_define_alloc_func(rb_cMatch, match_alloc);
3343 rb_undef_method(CLASS_OF(rb_cMatch), "new");
3345 rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
3346 rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
3347 rb_define_method(rb_cMatch, "names", match_names, 0);
3348 rb_define_method(rb_cMatch, "size", match_size, 0);
3349 rb_define_method(rb_cMatch, "length", match_size, 0);
3350 rb_define_method(rb_cMatch, "offset", match_offset, 1);
3351 rb_define_method(rb_cMatch, "begin", match_begin, 1);
3352 rb_define_method(rb_cMatch, "end", match_end, 1);
3353 rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
3354 rb_define_method(rb_cMatch, "[]", match_aref, -1);
3355 rb_define_method(rb_cMatch, "captures", match_captures, 0);
3356 rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
3357 rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
3358 rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
3359 rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
3360 rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
3361 rb_define_method(rb_cMatch, "string", match_string, 0);