re.c

   1 /**********************************************************************
   2
   3   re.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 18:24:49 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9
  10 **********************************************************************/
  11
  12 #include "ruby/ruby.h"
  13 #include "ruby/re.h"
  14 #include "ruby/encoding.h"
  15 #include "ruby/util.h"
  16 #include "regint.h"
  17 #include <ctype.h>
  18
/* Exception class used for regular-expression errors in this file: it is the
 * class passed to rb_raise() in rb_reg_raise() and to rb_exc_new3() in
 * rb_enc_reg_error_desc(). */
   19 VALUE rb_eRegexpError;
   20
/* Character buffer of ONIG_MAX_ERROR_MESSAGE_LEN bytes; make_regexp() hands
 * one of these to onig_error_code_to_str() to receive the error text. */
   21 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
   22
/* Shorthand for the begin/end entry of register `no`.  Both expand to an
 * expression on a variable literally named `regs`, so such a variable must be
 * in scope wherever they are used. */
   23 #define BEG(no) regs->beg[no]
   24 #define END(no) regs->end[no]
  25
  26 #if 'a' == 97   /* it's ascii */
  27 static const char casetable[] = {
  28         '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  29         '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  30         '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  31         '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  32         /* ' '     '!'     '"'     '#'     '$'     '%'     '&'     ''' */
  33         '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  34         /* '('     ')'     '*'     '+'     ','     '-'     '.'     '/' */
  35         '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  36         /* '0'     '1'     '2'     '3'     '4'     '5'     '6'     '7' */
  37         '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  38         /* '8'     '9'     ':'     ';'     '<'     '='     '>'     '?' */
  39         '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  40         /* '@'     'A'     'B'     'C'     'D'     'E'     'F'     'G' */
  41         '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  42         /* 'H'     'I'     'J'     'K'     'L'     'M'     'N'     'O' */
  43         '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  44         /* 'P'     'Q'     'R'     'S'     'T'     'U'     'V'     'W' */
  45         '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  46         /* 'X'     'Y'     'Z'     '['     '\'     ']'     '^'     '_' */
  47         '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  48         /* '`'     'a'     'b'     'c'     'd'     'e'     'f'     'g' */
  49         '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  50         /* 'h'     'i'     'j'     'k'     'l'     'm'     'n'     'o' */
  51         '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  52         /* 'p'     'q'     'r'     's'     't'     'u'     'v'     'w' */
  53         '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  54         /* 'x'     'y'     'z'     '{'     '|'     '}'     '~' */
  55         '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  56         '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  57         '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  58         '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  59         '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  60         '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  61         '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  62         '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  63         '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  64         '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  65         '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  66         '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  67         '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  68         '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  69         '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  70         '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  71         '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
  72 };
  73 #else
  74 # error >>> "You lose. You will need a translation table for your character set." <<<
  75 #endif
  76
  77 int
  78 rb_memcicmp(const void *x, const void *y, long len)
  79 {
  80     const unsigned char *p1 = x, *p2 = y;
  81     int tmp;
  82
  83     while (len--) {
  84         if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
  85             return tmp;
  86     }
  87     return 0;
  88 }
  89
  90 #undef rb_memcmp
  91
  92 int
  93 rb_memcmp(const void *p1, const void *p2, long len)
  94 {
  95     return memcmp(p1, p2, len);
  96 }
  97
  98 static inline long
  99 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
 100 {
 101     const unsigned char *x = xs, *xe = xs + m;
 102     const unsigned char *y = ys, *ye = ys + n;
 103 #ifndef VALUE_MAX
 104 # if SIZEOF_VALUE == 8
 105 #  define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
 106 # elif SIZEOF_VALUE == 4
 107 #  define VALUE_MAX 0xFFFFFFFFUL
 108 # endif
 109 #endif
 110     VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
 111
 112     if (m > SIZEOF_VALUE)
 113         rb_bug("!!too long pattern string!!");
 114
 115         /* Prepare hash value */
 116     for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
 117         hx <<= CHAR_BIT;
 118         hy <<= CHAR_BIT;
 119         hx |= *x;
 120         hy |= *y;
 121     }
 122     /* Searching */
 123     while (hx != hy) {
 124         if (y == ye)
 125             return -1;
 126         hy <<= CHAR_BIT;
 127         hy |= *y;
 128         hy &= mask;
 129         y++;
 130     }
 131     return y - ys - m;
 132 }
 133
 134 static inline long
 135 rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
 136 {
 137     const unsigned char *x = xs, *xe = xs + m;
 138     const unsigned char *y = ys;
 139     VALUE i, qstable[256];
 140
 141     /* Preprocessing */
 142     for (i = 0; i < 256; ++i)
 143         qstable[i] = m + 1;
 144     for (; x < xe; ++x)
 145         qstable[*x] = xe - x;
 146     /* Searching */
 147     for (; y + m <= ys + n; y += *(qstable + y[m])) {
 148         if (*xs == *y && memcmp(xs, y, m) == 0)
 149             return y - ys;
 150     }
 151     return -1;
 152 }
 153
 154 static inline unsigned int
 155 rb_memsearch_qs_utf8_hash(const unsigned char *x)
 156 {
 157     register const unsigned int mix = 8353;
 158     register unsigned int h = *x;
 159     if (h < 0xC0) {
 160         return h + 256;
 161     }
 162     else if (h < 0xE0) {
 163         h *= mix;
 164         h += x[1];
 165     }
 166     else if (h < 0xF0) {
 167         h *= mix;
 168         h += x[1];
 169         h *= mix;
 170         h += x[2];
 171     }
 172     else if (h < 0xF5) {
 173         h *= mix;
 174         h += x[1];
 175         h *= mix;
 176         h += x[2];
 177         h *= mix;
 178         h += x[3];
 179     }
 180     else {
 181         return h + 256;
 182     }
 183     return (unsigned char)h;
 184 }
 185
 186 static inline long
 187 rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
 188 {
 189     const unsigned char *x = xs, *xe = xs + m;
 190     const unsigned char *y = ys;
 191     VALUE i, qstable[512];
 192
 193     /* Preprocessing */
 194     for (i = 0; i < 512; ++i) {
 195         qstable[i] = m + 1;
 196     }
 197     for (; x < xe; ++x) {
 198         qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
 199     }
 200     /* Searching */
 201     for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) {
 202         if (*xs == *y && memcmp(xs, y, m) == 0)
 203             return y - ys;
 204     }
 205     return -1;
 206 }
 207
 208 long
 209 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
 210 {
 211     const unsigned char *x = x0, *y = y0;
 212
 213     if (m > n) return -1;
 214     else if (m == n) {
 215         return memcmp(x0, y0, m) == 0 ? 0 : -1;
 216     }
 217     else if (m < 1) {
 218         return 0;
 219     }
 220     else if (m == 1) {
 221         const unsigned char *ys = y, *ye = ys + n;
 222         for (; y < ye; ++y) {
 223             if (*x == *y)
 224                 return y - ys;
 225         }
 226         return -1;
 227     }
 228     else if (m <= SIZEOF_VALUE) {
 229         return rb_memsearch_ss(x0, m, y0, n);
 230     }
 231     else if (enc == rb_utf8_encoding()){
 232         return rb_memsearch_qs_utf8(x0, m, y0, n);
 233     }
 234     else {
 235         return rb_memsearch_qs(x0, m, y0, n);
 236     }
 237 }
 238
/* Names for four of the FL_USERn flag bits.
 * NOTE(review): presumably stored in the flags of Regexp objects; the code
 * that sets and tests them is outside this excerpt -- confirm. */
  239 #define REG_LITERAL FL_USER5
  240 #define REG_ENCODING_NONE FL_USER6
  241 #define REG_BUSY FL_USER7
  242
  243 #define KCODE_FIXED FL_USER4
  244
/* The three Oniguruma option bits that char_to_option() can produce. */
  245 #define ARG_REG_OPTION_MASK \
  246     (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
/* Values rb_char_to_option_kcode() stores in *option: FIXED for the
 * encoding letters 'e', 's' and 'u', NONE for 'n'. */
  247 #define ARG_ENCODING_FIXED    16
  248 #define ARG_ENCODING_NONE     32
 249
 250 static int
 251 char_to_option(int c)
 252 {
 253     int val;
 254
 255     switch (c) {
 256       case 'i':
 257         val = ONIG_OPTION_IGNORECASE;
 258         break;
 259       case 'x':
 260         val = ONIG_OPTION_EXTEND;
 261         break;
 262       case 'm':
 263         val = ONIG_OPTION_MULTILINE;
 264         break;
 265       default:
 266         val = 0;
 267         break;
 268     }
 269     return val;
 270 }
 271
 272 static char *
 273 option_to_str(char str[4], int options)
 274 {
 275     char *p = str;
 276     if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
 277     if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
 278     if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
 279     *p = 0;
 280     return str;
 281 }
 282
 283 extern int
 284 rb_char_to_option_kcode(int c, int *option, int *kcode)
 285 {
 286     *option = 0;
 287
 288     switch (c) {
 289       case 'n':
 290         *kcode = -1;
 291         return (*option = ARG_ENCODING_NONE);
 292       case 'e':
 293         *kcode = rb_enc_find_index("EUC-JP");
 294         break;
 295       case 's':
 296         *kcode = rb_enc_find_index("Windows-31J");
 297         break;
 298       case 'u':
 299         *kcode = rb_enc_find_index("UTF-8");
 300         break;
 301       default:
 302         *kcode = -1;
 303         return (*option = char_to_option(c));
 304     }
 305     *option = ARG_ENCODING_FIXED;
 306     return 1;
 307 }
 308
 309 static void
 310 rb_reg_check(VALUE re)
 311 {
 312     if (!RREGEXP(re)->ptr || !RREGEXP(re)->str) {
 313         rb_raise(rb_eTypeError, "uninitialized Regexp");
 314     }
 315 }
 316
 317 static void
 318 rb_reg_expr_str(VALUE str, const char *s, long len)
 319 {
 320     rb_encoding *enc = rb_enc_get(str);
 321     const char *p, *pend;
 322     int need_escape = 0;
 323     int c, clen;
 324
 325     p = s; pend = p + len;
 326     while (p<pend) {
 327         c = rb_enc_ascget(p, pend, &clen, enc);
 328         if (c == -1) {
 329             p += mbclen(p, pend, enc);
 330         }
 331         else if (c != '/' && rb_enc_isprint(c, enc)) {
 332             p += clen;
 333         }
 334         else {
 335             need_escape = 1;
 336             break;
 337         }
 338     }
 339     if (!need_escape) {
 340         rb_str_buf_cat(str, s, len);
 341     }
 342     else {
 343         p = s;
 344         while (p<pend) {
 345             c = rb_enc_ascget(p, pend, &clen, enc);
 346             if (c == '\\' && p+clen < pend) {
 347                 int n = clen + mbclen(p+clen, pend, enc);
 348                 rb_str_buf_cat(str, p, n);
 349                 p += n;
 350                 continue;
 351             }
 352             else if (c == '/') {
 353                 char c = '\\';
 354                 rb_str_buf_cat(str, &c, 1);
 355                 rb_str_buf_cat(str, p, clen);
 356             }
 357             else if (c == -1) {
 358                 int l = mbclen(p, pend, enc);
 359                 rb_str_buf_cat(str, p, l);
 360                 p += l;
 361                 continue;
 362             }
 363             else if (rb_enc_isprint(c, enc)) {
 364                 rb_str_buf_cat(str, p, clen);
 365             }
 366             else if (!rb_enc_isspace(c, enc)) {
 367                 char b[8];
 368
 369                 sprintf(b, "\\x%02X", c);
 370                 rb_str_buf_cat(str, b, 4);
 371             }
 372             else {
 373                 rb_str_buf_cat(str, p, clen);
 374             }
 375             p += clen;
 376         }
 377     }
 378 }
 379
 380 static VALUE
 381 rb_reg_desc(const char *s, long len, VALUE re)
 382 {
 383     VALUE str = rb_str_buf_new2("/");
 384
 385     rb_enc_copy(str, re);
 386     rb_reg_expr_str(str, s, len);
 387     rb_str_buf_cat2(str, "/");
 388     if (re) {
 389         char opts[4];
 390         rb_reg_check(re);
 391         if (*option_to_str(opts, RREGEXP(re)->ptr->options))
 392             rb_str_buf_cat2(str, opts);
 393     }
 394     OBJ_INFECT(str, re);
 395     return str;
 396 }
 397
 398
 399 /*
 400  *  call-seq:
 401  *      rxp.source   => str
 402  *
 403  *  Returns the original string of the pattern.
 404  *
 405  *      /ab+c/ix.source #=> "ab+c"
 406  *
 407  *  Note that escape sequences are retained as is.
 408  *
 409  *     /\x20\+/.source  #=> "\\x20\\+"
 410  *
 411  */
 412
 413 static VALUE
 414 rb_reg_source(VALUE re)
 415 {
 416     VALUE str;
 417
 418     rb_reg_check(re);
 419     str = rb_enc_str_new(RREGEXP(re)->str,RREGEXP(re)->len, rb_enc_get(re));
 420     if (OBJ_TAINTED(re)) OBJ_TAINT(str);
 421     return str;
 422 }
 423
 424 /*
 425  * call-seq:
 426  *    rxp.inspect   => string
 427  *
 428  * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly,
 429  * <code>#inspect</code> actually produces the more natural version of
 430  * the string than <code>#to_s</code>.
 431  *
 432  *      /ab+c/ix.inspect        #=> "/ab+c/ix"
 433  *
 434  */
 435
 436 static VALUE
 437 rb_reg_inspect(VALUE re)
 438 {
 439     rb_reg_check(re);
 440     return rb_reg_desc(RREGEXP(re)->str, RREGEXP(re)->len, re);
 441 }
 442
 443
 444 /*
 445  *  call-seq:
 446  *     rxp.to_s   => str
 447  *
  448  *  Returns a string containing the regular expression and its options (using the
  449  *  <code>(?opts:source)</code> notation). This string can be fed back in to
 450  *  <code>Regexp::new</code> to a regular expression with the same semantics as
 451  *  the original. (However, <code>Regexp#==</code> may not return true when
 452  *  comparing the two, as the source of the regular expression itself may
 453  *  differ, as the example shows).  <code>Regexp#inspect</code> produces a
 454  *  generally more readable version of <i>rxp</i>.
 455  *
 456  *      r1 = /ab+c/ix           #=> /ab+c/ix
 457  *      s1 = r1.to_s            #=> "(?ix-m:ab+c)"
 458  *      r2 = Regexp.new(s1)     #=> /(?ix-m:ab+c)/
 459  *      r1 == r2                #=> false
 460  *      r1.source               #=> "ab+c"
 461  *      r2.source               #=> "(?ix-m:ab+c)"
 462  */
 463
 464 static VALUE
 465 rb_reg_to_s(VALUE re)
 466 {
 467     int options, opt;
 468     const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
 469     long len;
 470     const UChar* ptr;
 471     VALUE str = rb_str_buf_new2("(?");
 472     char optbuf[5];
 473
 474     rb_reg_check(re);
 475
 476     rb_enc_copy(str, re);
 477     options = RREGEXP(re)->ptr->options;
 478     ptr = (UChar*)RREGEXP(re)->str;
 479     len = RREGEXP(re)->len;
 480   again:
 481     if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
 482         int err = 1;
 483         ptr += 2;
 484         if ((len -= 2) > 0) {
 485             do {
 486                 opt = char_to_option((int )*ptr);
 487                 if (opt != 0) {
 488                     options |= opt;
 489                 }
 490                 else {
 491                     break;
 492                 }
 493                 ++ptr;
 494             } while (--len > 0);
 495         }
 496         if (len > 1 && *ptr == '-') {
 497             ++ptr;
 498             --len;
 499             do {
 500                 opt = char_to_option((int )*ptr);
 501                 if (opt != 0) {
 502                     options &= ~opt;
 503                 }
 504                 else {
 505                     break;
 506                 }
 507                 ++ptr;
 508             } while (--len > 0);
 509         }
 510         if (*ptr == ')') {
 511             --len;
 512             ++ptr;
 513             goto again;
 514         }
 515         if (*ptr == ':' && ptr[len-1] == ')') {
 516             int r;
 517             Regexp *rp;
 518             r = onig_alloc_init(&rp, ONIG_OPTION_DEFAULT,
 519                                 ONIGENC_CASE_FOLD_DEFAULT,
 520                                 rb_enc_get(re),
 521                                 OnigDefaultSyntax);
 522             if (r == 0) {
 523                  ++ptr;
 524                  len -= 2;
 525                  err = (onig_compile(rp, ptr, ptr + len, NULL) != 0);
 526             }
 527             onig_free(rp);
 528         }
 529         if (err) {
 530             options = RREGEXP(re)->ptr->options;
 531             ptr = (UChar*)RREGEXP(re)->str;
 532             len = RREGEXP(re)->len;
 533         }
 534     }
 535
 536     if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
 537
 538     if ((options & embeddable) != embeddable) {
 539         optbuf[0] = '-';
 540         option_to_str(optbuf + 1, ~options);
 541         rb_str_buf_cat2(str, optbuf);
 542     }
 543
 544     rb_str_buf_cat2(str, ":");
 545     rb_reg_expr_str(str, (char*)ptr, len);
 546     rb_str_buf_cat2(str, ")");
 547     rb_enc_copy(str, re);
 548
 549     OBJ_INFECT(str, re);
 550     return str;
 551 }
 552
 553 static void
 554 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
 555 {
 556     VALUE desc = rb_reg_desc(s, len, re);
 557
 558     rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
 559 }
 560
 561 static VALUE
 562 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
 563 {
 564     char opts[6];
 565     VALUE desc = rb_str_buf_new2(err);
 566
 567     rb_enc_associate(desc, enc);
 568     rb_str_buf_cat2(desc, ": /");
 569     rb_reg_expr_str(desc, s, len);
 570     opts[0] = '/';
 571     option_to_str(opts + 1, options);
 572     rb_str_buf_cat2(desc, opts);
 573     return rb_exc_new3(rb_eRegexpError, desc);
 574 }
 575
 576 static void
 577 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
 578 {
 579     rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
 580 }
 581
 582 static VALUE
 583 rb_reg_error_desc(VALUE str, int options, const char *err)
 584 {
 585     return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
 586                                  rb_enc_get(str), options, err);
 587 }
 588
 589 static void
 590 rb_reg_raise_str(VALUE str, int options, const char *err)
 591 {
 592     rb_exc_raise(rb_reg_error_desc(str, options, err));
 593 }
 594
 595
 596 /*
 597  *  call-seq:
 598  *     rxp.casefold?   => true or false
 599  *
 600  *  Returns the value of the case-insensitive flag.
 601  *
 602  *      /a/.casefold?           #=> false
 603  *      /a/i.casefold?          #=> true
 604  *      /(?i:a)/.casefold?      #=> false
 605  */
 606
 607 static VALUE
 608 rb_reg_casefold_p(VALUE re)
 609 {
 610     rb_reg_check(re);
 611     if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
 612     return Qfalse;
 613 }
 614
 615
 616 /*
 617  *  call-seq:
 618  *     rxp.options   => fixnum
 619  *
 620  *  Returns the set of bits corresponding to the options used when creating this
  621  *  Regexp (see <code>Regexp::new</code> for details). Note that additional bits
 622  *  may be set in the returned options: these are used internally by the regular
 623  *  expression code. These extra bits are ignored if the options are passed to
 624  *  <code>Regexp::new</code>.
 625  *
 626  *     Regexp::IGNORECASE                  #=> 1
 627  *     Regexp::EXTENDED                    #=> 2
 628  *     Regexp::MULTILINE                   #=> 4
 629  *
 630  *     /cat/.options                       #=> 0
 631  *     /cat/ix.options                     #=> 3
 632  *     Regexp.new('cat', true).options     #=> 1
 633  *     /\xa1\xa2/e.options                 #=> 16
 634  *
 635  *     r = /cat/ix
 636  *     Regexp.new(r.source, r.options)     #=> /cat/ix
 637  */
 638
 639 static VALUE
 640 rb_reg_options_m(VALUE re)
 641 {
 642     int options = rb_reg_options(re);
 643     return INT2NUM(options);
 644 }
 645
 646 static int
 647 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
 648           int back_num, int *back_refs, OnigRegex regex, void *arg)
 649 {
 650     VALUE ary = (VALUE)arg;
 651     rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
 652     return 0;
 653 }
 654
 655 /*
 656  * call-seq:
 657  *    rxp.names   => [name1, name2, ...]
 658  *
 659  * Returns a list of names of captures as an array of strings.
 660  *
 661  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.names
 662  *     #=> ["foo", "bar", "baz"]
 663  *
 664  *     /(?<foo>.)(?<foo>.)/.names
 665  *     #=> ["foo"]
 666  *
 667  *     /(.)(.)/.names
 668  *     #=> []
 669  */
 670
 671 static VALUE
 672 rb_reg_names(VALUE re)
 673 {
 674     VALUE ary = rb_ary_new();
 675     onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
 676     return ary;
 677 }
 678
 679 static int
 680 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
 681           int back_num, int *back_refs, OnigRegex regex, void *arg)
 682 {
 683     VALUE hash = (VALUE)arg;
 684     VALUE ary = rb_ary_new2(back_num);
 685     int i;
 686
 687     for(i = 0; i < back_num; i++)
 688         rb_ary_store(ary, i, INT2NUM(back_refs[i]));
 689
 690     rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
 691
 692     return 0;
 693 }
 694
 695 /*
 696  * call-seq:
 697  *    rxp.named_captures  => hash
 698  *
 699  * Returns a hash representing information about named captures of <i>rxp</i>.
 700  *
  701  * Each key of the hash is the name of a named capture.
  702  * Each value of the hash is an array listing the indexes of the captures
  703  * that have that name.
 704  *
 705  *    /(?<foo>.)(?<bar>.)/.named_captures
 706  *    #=> {"foo"=>[1], "bar"=>[2]}
 707  *
 708  *    /(?<foo>.)(?<foo>.)/.named_captures
 709  *    #=> {"foo"=>[1, 2]}
 710  *
 711  * If there are no named captures, an empty hash is returned.
 712  *
 713  *    /(.)(.)/.named_captures
 714  *    #=> {}
 715  */
 716
 717 static VALUE
 718 rb_reg_named_captures(VALUE re)
 719 {
 720     VALUE hash = rb_hash_new();
 721     onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
 722     return hash;
 723 }
 724
 725 static Regexp*
 726 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err)
 727 {
 728     Regexp *rp;
 729     int r;
 730     OnigErrorInfo einfo;
 731
 732     /* Handle escaped characters first. */
 733
 734     /* Build a copy of the string (in dest) with the
 735        escaped characters translated,  and generate the regex
 736        from that.
 737     */
 738
 739     r = onig_alloc_init(&rp, flags, ONIGENC_CASE_FOLD_DEFAULT,
 740                         enc, OnigDefaultSyntax);
 741     if (r) {
 742         onig_error_code_to_str((UChar*)err, r);
 743         return 0;
 744     }
 745
 746     r = onig_compile(rp, (UChar*)s, (UChar*)(s + len), &einfo);
 747
 748     if (r != 0) {
 749         onig_free(rp);
 750         (void )onig_error_code_to_str((UChar*)err, r, &einfo);
 751         return 0;
 752     }
 753     return rp;
 754 }
 755
 756
 757 /*
 758  *  Document-class: MatchData
 759  *
 760  *  <code>MatchData</code> is the type of the special variable <code>$~</code>,
 761  *  and is the type of the object returned by <code>Regexp#match</code> and
 762  *  <code>Regexp.last_match</code>. It encapsulates all the results of a pattern
 763  *  match, results normally accessed through the special variables
 764  *  <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>,
 765  *  <code>$2</code>, and so on.
 766  *
 767  */
 768
 769 VALUE rb_cMatch;
 770
 771 static VALUE
 772 match_alloc(VALUE klass)
 773 {
 774     NEWOBJ(match, struct RMatch);
 775     OBJSETUP(match, klass, T_MATCH);
 776
 777     match->str = 0;
 778     match->rmatch = 0;
 779     match->regexp = 0;
 780     match->rmatch = ALLOC(struct rmatch);
 781     MEMZERO(match->rmatch, struct rmatch, 1);
 782
 783     return (VALUE)match;
 784 }
 785
 786 typedef struct {
 787     int byte_pos;
 788     int char_pos;
 789 } pair_t;
 790
 791 static int
 792 pair_byte_cmp(const void *pair1, const void *pair2)
 793 {
 794     return ((pair_t*)pair1)->byte_pos - ((pair_t*)pair2)->byte_pos;
 795 }
 796
 797 static void
 798 update_char_offset(VALUE match)
 799 {
 800     struct rmatch *rm = RMATCH(match)->rmatch;
 801     struct re_registers *regs;
 802     int num_regs;
 803     int i, num_pos, c;
 804     char *s, *p, *q, *e;
 805     rb_encoding *enc;
 806     pair_t *pairs;
 807
 808     if (rm->char_offset_updated)
 809         return;
 810
 811     regs = &rm->regs;
 812     num_regs = rm->regs.num_regs;
 813
 814     if (rm->char_offset_num_allocated < num_regs) {
 815         REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
 816         rm->char_offset_num_allocated = num_regs;
 817     }
 818
 819     enc = rb_enc_get(RMATCH(match)->str);
 820     if (rb_enc_mbmaxlen(enc) == 1) {
 821         for (i = 0; i < num_regs; i++) {
 822             rm->char_offset[i].beg = BEG(i);
 823             rm->char_offset[i].end = END(i);
 824         }
 825         rm->char_offset_updated = 1;
 826         return;
 827     }
 828
 829     pairs = ALLOCA_N(pair_t, num_regs*2);
 830     num_pos = 0;
 831     for (i = 0; i < num_regs; i++) {
 832         if (BEG(i) < 0)
 833             continue;
 834         pairs[num_pos++].byte_pos = BEG(i);
 835         pairs[num_pos++].byte_pos = END(i);
 836     }
 837     qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
 838
 839     s = p = RSTRING_PTR(RMATCH(match)->str);
 840     e = s + RSTRING_LEN(RMATCH(match)->str);
 841     c = 0;
 842     for (i = 0; i < num_pos; i++) {
 843         q = s + pairs[i].byte_pos;
 844         c += rb_enc_strlen(p, q, enc);
 845         pairs[i].char_pos = c;
 846         p = q;
 847     }
 848
 849     for (i = 0; i < num_regs; i++) {
 850         pair_t key, *found;
 851         if (BEG(i) < 0) {
 852             rm->char_offset[i].beg = -1;
 853             rm->char_offset[i].end = -1;
 854             continue;
 855         }
 856
 857         key.byte_pos = BEG(i);
 858         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
 859         rm->char_offset[i].beg = found->char_pos;
 860
 861         key.byte_pos = END(i);
 862         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
 863         rm->char_offset[i].end = found->char_pos;
 864     }
 865
 866     rm->char_offset_updated = 1;
 867 }
 868
 869 /* :nodoc: */
 870 static VALUE
 871 match_init_copy(VALUE obj, VALUE orig)
 872 {
 873     struct rmatch *rm;
 874
 875     if (obj == orig) return obj;
 876
 877     if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
 878         rb_raise(rb_eTypeError, "wrong argument class");
 879     }
 880     RMATCH(obj)->str = RMATCH(orig)->str;
 881     RMATCH(obj)->regexp = RMATCH(orig)->regexp;
 882
 883     rm = RMATCH(obj)->rmatch;
 884     onig_region_copy(&rm->regs, RMATCH_REGS(orig));
 885
 886     if (!RMATCH(orig)->rmatch->char_offset_updated) {
 887         rm->char_offset_updated = 0;
 888     }
 889     else {
 890         if (rm->char_offset_num_allocated < rm->regs.num_regs) {
 891             REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
 892             rm->char_offset_num_allocated = rm->regs.num_regs;
 893         }
 894         MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
 895                struct rmatch_offset, rm->regs.num_regs);
 896         rm->char_offset_updated = 1;
 897     }
 898
 899     return obj;
 900 }
 901
 902
 903 /*
 904  * call-seq:
 905  *    mtch.regexp   => regexp
 906  *
 907  * Returns the regexp.
 908  *
 909  *     m = /a.*b/.match("abc")
 910  *     m.regexp #=> /a.*b/
 911  */
 912
 913 static VALUE
 914 match_regexp(VALUE match)
 915 {
 916     return RMATCH(match)->regexp;
 917 }
 918
 919 /*
 920  * call-seq:
 921  *    mtch.names   => [name1, name2, ...]
 922  *
 923  * Returns a list of names of captures as an array of strings.
  924  * It is the same as mtch.regexp.names.
 925  *
 926  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
 927  *     #=> ["foo", "bar", "baz"]
 928  *
 929  *     m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
 930  *     m.names                          #=> ["x", "y"]
 931  */
 932
 933 static VALUE
 934 match_names(VALUE match)
 935 {
 936     return rb_reg_names(RMATCH(match)->regexp);
 937 }
 938
 939 /*
 940  *  call-seq:
 941  *     mtch.length   => integer
 942  *     mtch.size     => integer
 943  *
 944  *  Returns the number of elements in the match array.
 945  *
 946  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
 947  *     m.length   #=> 5
 948  *     m.size     #=> 5
 949  */
 950
 951 static VALUE
 952 match_size(VALUE match)
 953 {
 954     return INT2FIX(RMATCH_REGS(match)->num_regs);
 955 }
 956
 957 static int
 958 match_backref_number(VALUE match, VALUE backref)
 959 {
 960     const char *name;
 961     int num;
 962
 963     struct re_registers *regs = RMATCH_REGS(match);
 964     VALUE regexp = RMATCH(match)->regexp;
 965
 966     switch(TYPE(backref)) {
 967       default:
 968         return NUM2INT(backref);
 969
 970       case T_SYMBOL:
 971         name = rb_id2name(SYM2ID(backref));
 972         break;
 973
 974       case T_STRING:
 975         name = StringValueCStr(backref);
 976         break;
 977     }
 978
 979     num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
 980               (const unsigned char*)name,
 981               (const unsigned char*)name + strlen(name),
 982               regs);
 983
 984     if (num < 1) {
 985         rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
 986     }
 987
 988     return num;
 989 }
 990
 991
 992 /*
 993  *  call-seq:
 994  *     mtch.offset(n)   => array
 995  *
 996  *  Returns a two-element array containing the beginning and ending offsets of
 997  *  the <em>n</em>th match.
 998  *  <em>n</em> can be a string or symbol to reference a named capture.
 999  *
1000  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1001  *     m.offset(0)      #=> [1, 7]
1002  *     m.offset(4)      #=> [6, 7]
1003  *
1004  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1005  *     p m.offset(:foo) #=> [0, 1]
1006  *     p m.offset(:bar) #=> [2, 3]
1007  *
1008  */
1009
1010 static VALUE
1011 match_offset(VALUE match, VALUE n)
1012 {
1013     int i = match_backref_number(match, n);
1014     struct re_registers *regs = RMATCH_REGS(match);
1015
1016     if (i < 0 || regs->num_regs <= i)
1017         rb_raise(rb_eIndexError, "index %d out of matches", i);
1018
1019     if (BEG(i) < 0)
1020         return rb_assoc_new(Qnil, Qnil);
1021
1022     update_char_offset(match);
1023     return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
1024                         INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
1025 }
1026
1027
1028 /*
1029  *  call-seq:
1030  *     mtch.begin(n)   => integer
1031  *
1032  *  Returns the offset of the start of the <em>n</em>th element of the match
1033  *  array in the string.
1034  *  <em>n</em> can be a string or symbol to reference a named capture.
1035  *
1036  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1037  *     m.begin(0)       #=> 1
1038  *     m.begin(2)       #=> 2
1039  *
1040  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1041  *     p m.begin(:foo)  #=> 0
1042  *     p m.begin(:bar)  #=> 2
1043  */
1044
1045 static VALUE
1046 match_begin(VALUE match, VALUE n)
1047 {
1048     int i = match_backref_number(match, n);
1049     struct re_registers *regs = RMATCH_REGS(match);
1050
1051     if (i < 0 || regs->num_regs <= i)
1052         rb_raise(rb_eIndexError, "index %d out of matches", i);
1053
1054     if (BEG(i) < 0)
1055         return Qnil;
1056
1057     update_char_offset(match);
1058     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
1059 }
1060
1061
1062 /*
1063  *  call-seq:
1064  *     mtch.end(n)   => integer
1065  *
1066  *  Returns the offset of the character immediately following the end of the
1067  *  <em>n</em>th element of the match array in the string.
1068  *  <em>n</em> can be a string or symbol to reference a named capture.
1069  *
1070  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1071  *     m.end(0)         #=> 7
1072  *     m.end(2)         #=> 3
1073  *
1074  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1075  *     p m.end(:foo)    #=> 1
1076  *     p m.end(:bar)    #=> 3
1077  */
1078
1079 static VALUE
1080 match_end(VALUE match, VALUE n)
1081 {
1082     int i = match_backref_number(match, n);
1083     struct re_registers *regs = RMATCH_REGS(match);
1084
1085     if (i < 0 || regs->num_regs <= i)
1086         rb_raise(rb_eIndexError, "index %d out of matches", i);
1087
1088     if (BEG(i) < 0)
1089         return Qnil;
1090
1091     update_char_offset(match);
1092     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
1093 }
1094
1095 #define MATCH_BUSY FL_USER2
1096
1097 void
1098 rb_match_busy(VALUE match)
1099 {
1100     FL_SET(match, MATCH_BUSY);
1101 }
1102
1103 /*
1104  *  call-seq:
1105  *     rxp.fixed_encoding?   => true or false
1106  *
1107  *  Returns false if rxp is applicable to
1108  *  a string with any ASCII compatible encoding.
1109  *  Returns true otherwise.
1110  *
1111  *      r = /a/
1112  *      r.fixed_encoding?                               #=> false
1113  *      r =~ "\u{6666} a"                               #=> 2
1114  *      r =~ "\xa1\xa2 a".force_encoding("euc-jp")      #=> 2
1115  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
1116  *
1117  *      r = /a/u
1118  *      r.fixed_encoding?                               #=> true
1119  *      r.encoding                                      #=> #<Encoding:UTF-8>
1120  *      r =~ "\u{6666} a"                               #=> 2
1121  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
1122  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
1123  *
1124  *      r = /\u{6666}/
1125  *      r.fixed_encoding?                               #=> true
1126  *      r.encoding                                      #=> #<Encoding:UTF-8>
1127  *      r =~ "\u{6666} a"                               #=> 0
1128  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
1129  *      r =~ "abc".force_encoding("euc-jp")             #=> nil
1130  */
1131
1132 static VALUE
1133 rb_reg_fixed_encoding_p(VALUE re)
1134 {
1135     if (FL_TEST(re, KCODE_FIXED))
1136         return Qtrue;
1137     else
1138         return Qfalse;
1139 }
1140
1141 static VALUE
1142 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
1143         rb_encoding **fixed_enc, onig_errmsg_buffer err);
1144
1145
1146 static void
1147 reg_enc_error(VALUE re, VALUE str)
1148 {
1149     rb_raise(rb_eArgError,
1150              "incompatible encoding regexp match (%s regexp with %s string)",
1151              rb_enc_name(RREGEXP(re)->ptr->enc),
1152              rb_enc_name(rb_enc_get(str)));
1153 }
1154
1155 static rb_encoding*
1156 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
1157 {
1158     rb_encoding *enc = 0;
1159
1160     if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
1161         rb_raise(rb_eArgError,
1162             "broken %s string",
1163             rb_enc_name(rb_enc_get(str)));
1164     }
1165
1166     rb_reg_check(re);
1167     enc = rb_enc_get(str);
1168     if (!rb_enc_str_asciicompat_p(str)) {
1169         if (RREGEXP(re)->ptr->enc != enc) {
1170             reg_enc_error(re, str);
1171         }
1172     }
1173     else if (rb_reg_fixed_encoding_p(re)) {
1174         if (RREGEXP(re)->ptr->enc != enc &&
1175             (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
1176              rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
1177             reg_enc_error(re, str);
1178         }
1179         enc = RREGEXP(re)->ptr->enc;
1180     }
1181     if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
1182         enc != rb_ascii8bit_encoding() &&
1183         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1184         rb_warn("regexp match /.../n against to %s string",
1185                 rb_enc_name(enc));
1186     }
1187     return enc;
1188 }
1189
1190 regex_t *
1191 rb_reg_prepare_re(VALUE re, VALUE str)
1192 {
1193     regex_t *reg = RREGEXP(re)->ptr;
1194     onig_errmsg_buffer err = "";
1195     int r;
1196     OnigErrorInfo einfo;
1197     const char *pattern;
1198     VALUE unescaped;
1199     rb_encoding *fixed_enc = 0;
1200     rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
1201
1202     if (reg->enc == enc) return reg;
1203
1204     rb_reg_check(re);
1205     reg = RREGEXP(re)->ptr;
1206     pattern = RREGEXP(re)->str;
1207
1208     unescaped = rb_reg_preprocess(
1209         pattern, pattern + RREGEXP(re)->len, enc,
1210         &fixed_enc, err);
1211
1212     if (unescaped == Qnil) {
1213         rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
1214     }
1215
1216     r = onig_new(&reg, (UChar* )RSTRING_PTR(unescaped),
1217                  (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
1218                  reg->options, enc,
1219                  OnigDefaultSyntax, &einfo);
1220     if (r) {
1221         onig_error_code_to_str((UChar*)err, r, &einfo);
1222         rb_reg_raise(pattern, RREGEXP(re)->len, err, re);
1223     }
1224
1225     RB_GC_GUARD(unescaped);
1226     return reg;
1227 }
1228
1229 int
1230 rb_reg_adjust_startpos(VALUE re, VALUE str, int pos, int reverse)
1231 {
1232     int range;
1233     rb_encoding *enc;
1234     UChar *p, *string;
1235
1236     enc = rb_reg_prepare_enc(re, str, 0);
1237
1238     if (reverse) {
1239         range = -pos;
1240     }
1241     else {
1242         range = RSTRING_LEN(str) - pos;
1243     }
1244
1245     if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
1246          string = (UChar*)RSTRING_PTR(str);
1247
1248          if (range > 0) {
1249               p = onigenc_get_right_adjust_char_head(enc, string, string + pos);
1250          }
1251          else {
1252               p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos);
1253          }
1254          return p - string;
1255     }
1256
1257     return pos;
1258 }
1259
1260 int
1261 rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
1262 {
1263     int result;
1264     VALUE match;
1265     struct re_registers *regs, regi;
1266     char *range = RSTRING_PTR(str);
1267     regex_t *reg0 = RREGEXP(re)->ptr, *reg;
1268     int busy = FL_TEST(re, REG_BUSY);
1269
1270     if (pos > RSTRING_LEN(str) || pos < 0) {
1271         rb_backref_set(Qnil);
1272         return -1;
1273     }
1274
1275     reg = rb_reg_prepare_re(re, str);
1276
1277     match = rb_backref_get();
1278     if (!NIL_P(match)) {
1279         if (FL_TEST(match, MATCH_BUSY)) {
1280             match = Qnil;
1281         }
1282         else {
1283             regs = RMATCH_REGS(match);
1284         }
1285     }
1286     if (NIL_P(match)) {
1287         regs = &regi;
1288         MEMZERO(regs, struct re_registers, 1);
1289     }
1290     FL_SET(re, REG_BUSY);
1291     if (!reverse) {
1292         range += RSTRING_LEN(str);
1293     }
1294     result = onig_search(reg,
1295                          (UChar*)(RSTRING_PTR(str)),
1296                          ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
1297                          ((UChar*)(RSTRING_PTR(str)) + pos),
1298                          ((UChar*)range),
1299                          regs, ONIG_OPTION_NONE);
1300
1301     if (RREGEXP(re)->ptr != reg) {
1302         if (busy) {
1303             onig_free(reg);
1304         }
1305         else {
1306             onig_free(reg0);
1307             RREGEXP(re)->ptr = reg;
1308         }
1309     }
1310     if (!busy) FL_UNSET(re, REG_BUSY);
1311     if (result < 0) {
1312         if (regs == &regi)
1313             onig_region_free(regs, 0);
1314         if (result == ONIG_MISMATCH) {
1315             rb_backref_set(Qnil);
1316             return result;
1317         }
1318         else {
1319             onig_errmsg_buffer err = "";
1320             onig_error_code_to_str((UChar*)err, result);
1321             rb_reg_raise(RREGEXP(re)->str, RREGEXP(re)->len, err, 0);
1322         }
1323     }
1324
1325     if (NIL_P(match)) {
1326         match = match_alloc(rb_cMatch);
1327         onig_region_copy(RMATCH_REGS(match), regs);
1328         onig_region_free(regs, 0);
1329     }
1330     else {
1331         if (rb_safe_level() >= 3)
1332             OBJ_TAINT(match);
1333         else
1334             FL_UNSET(match, FL_TAINT);
1335     }
1336
1337     RMATCH(match)->str = rb_str_new4(str);
1338     RMATCH(match)->regexp = re;
1339     RMATCH(match)->rmatch->char_offset_updated = 0;
1340     rb_backref_set(match);
1341
1342     OBJ_INFECT(match, re);
1343     OBJ_INFECT(match, str);
1344
1345     return result;
1346 }
1347
1348 VALUE
1349 rb_reg_nth_defined(int nth, VALUE match)
1350 {
1351     struct re_registers *regs;
1352     if (NIL_P(match)) return Qnil;
1353     regs = RMATCH_REGS(match);
1354     if (nth >= regs->num_regs) {
1355         return Qnil;
1356     }
1357     if (nth < 0) {
1358         nth += regs->num_regs;
1359         if (nth <= 0) return Qnil;
1360     }
1361     if (BEG(nth) == -1) return Qfalse;
1362     return Qtrue;
1363 }
1364
1365 VALUE
1366 rb_reg_nth_match(int nth, VALUE match)
1367 {
1368     VALUE str;
1369     long start, end, len;
1370     struct re_registers *regs;
1371
1372     if (NIL_P(match)) return Qnil;
1373     regs = RMATCH_REGS(match);
1374     if (nth >= regs->num_regs) {
1375         return Qnil;
1376     }
1377     if (nth < 0) {
1378         nth += regs->num_regs;
1379         if (nth <= 0) return Qnil;
1380     }
1381     start = BEG(nth);
1382     if (start == -1) return Qnil;
1383     end = END(nth);
1384     len = end - start;
1385     str = rb_str_subseq(RMATCH(match)->str, start, len);
1386     OBJ_INFECT(str, match);
1387     return str;
1388 }
1389
1390 VALUE
1391 rb_reg_last_match(VALUE match)
1392 {
1393     return rb_reg_nth_match(0, match);
1394 }
1395
1396
1397 /*
1398  *  call-seq:
1399  *     mtch.pre_match   => str
1400  *
1401  *  Returns the portion of the original string before the current match.
1402  *  Equivalent to the special variable <code>$`</code>.
1403  *
1404  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1405  *     m.pre_match   #=> "T"
1406  */
1407
1408 VALUE
1409 rb_reg_match_pre(VALUE match)
1410 {
1411     VALUE str;
1412     struct re_registers *regs;
1413
1414     if (NIL_P(match)) return Qnil;
1415     regs = RMATCH_REGS(match);
1416     if (BEG(0) == -1) return Qnil;
1417     str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
1418     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1419     return str;
1420 }
1421
1422
1423 /*
1424  *  call-seq:
1425  *     mtch.post_match   => str
1426  *
1427  *  Returns the portion of the original string after the current match.
1428  *  Equivalent to the special variable <code>$'</code>.
1429  *
1430  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1431  *     m.post_match   #=> ": The Movie"
1432  */
1433
1434 VALUE
1435 rb_reg_match_post(VALUE match)
1436 {
1437     VALUE str;
1438     long pos;
1439     struct re_registers *regs;
1440
1441     if (NIL_P(match)) return Qnil;
1442     regs = RMATCH_REGS(match);
1443     if (BEG(0) == -1) return Qnil;
1444     str = RMATCH(match)->str;
1445     pos = END(0);
1446     str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
1447     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1448     return str;
1449 }
1450
1451 VALUE
1452 rb_reg_match_last(VALUE match)
1453 {
1454     int i;
1455     struct re_registers *regs;
1456
1457     if (NIL_P(match)) return Qnil;
1458     regs = RMATCH_REGS(match);
1459     if (BEG(0) == -1) return Qnil;
1460
1461     for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
1462         ;
1463     if (i == 0) return Qnil;
1464     return rb_reg_nth_match(i, match);
1465 }
1466
1467 static VALUE
1468 last_match_getter(void)
1469 {
1470     return rb_reg_last_match(rb_backref_get());
1471 }
1472
1473 static VALUE
1474 prematch_getter(void)
1475 {
1476     return rb_reg_match_pre(rb_backref_get());
1477 }
1478
1479 static VALUE
1480 postmatch_getter(void)
1481 {
1482     return rb_reg_match_post(rb_backref_get());
1483 }
1484
1485 static VALUE
1486 last_paren_match_getter(void)
1487 {
1488     return rb_reg_match_last(rb_backref_get());
1489 }
1490
1491 static VALUE
1492 match_array(VALUE match, int start)
1493 {
1494     struct re_registers *regs = RMATCH_REGS(match);
1495     VALUE ary = rb_ary_new2(regs->num_regs);
1496     VALUE target = RMATCH(match)->str;
1497     int i;
1498     int taint = OBJ_TAINTED(match);
1499
1500     for (i=start; i<regs->num_regs; i++) {
1501         if (regs->beg[i] == -1) {
1502             rb_ary_push(ary, Qnil);
1503         }
1504         else {
1505             VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
1506             if (taint) OBJ_TAINT(str);
1507             rb_ary_push(ary, str);
1508         }
1509     }
1510     return ary;
1511 }
1512
1513
1514 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the
1515    second example to prevent the '*' followed by a '/' from ending the
1516    comment. */
1517
1518 /*
1519  *  call-seq:
1520  *     mtch.to_a   => anArray
1521  *
1522  *  Returns the array of matches.
1523  *
1524  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1525  *     m.to_a   #=> ["HX1138", "H", "X", "113", "8"]
1526  *
1527  *  Because <code>to_a</code> is called when expanding
1528  *  <code>*</code><em>variable</em>, there's a useful assignment
1529  *  shortcut for extracting matched fields. This is slightly slower than
1530  *  accessing the fields directly (as an intermediate array is
1531  *  generated).
1532  *
1533  *     all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138."))
1534  *     all   #=> "HX1138"
1535  *     f1    #=> "H"
1536  *     f2    #=> "X"
1537  *     f3    #=> "113"
1538  */
1539
1540 static VALUE
1541 match_to_a(VALUE match)
1542 {
1543     return match_array(match, 0);
1544 }
1545
1546
1547 /*
1548  *  call-seq:
1549  *     mtch.captures   => array
1550  *
1551  *  Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
1552  *
1553  *     f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
1554  *     f1    #=> "H"
1555  *     f2    #=> "X"
1556  *     f3    #=> "113"
1557  *     f4    #=> "8"
1558  */
1559 static VALUE
1560 match_captures(VALUE match)
1561 {
1562     return match_array(match, 1);
1563 }
1564
1565 static int
1566 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
1567 {
1568   int num;
1569
1570   num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
1571             (const unsigned char* )name, (const unsigned char* )name_end, regs);
1572   if (num >= 1) {
1573     return num;
1574   }
1575   else {
1576     VALUE s = rb_str_new(name, (long )(name_end - name));
1577     rb_raise(rb_eIndexError, "undefined group name reference: %s",
1578                              StringValuePtr(s));
1579   }
1580 }
1581
1582 /*
1583  *  call-seq:
1584  *     mtch[i]               => str or nil
1585  *     mtch[start, length]   => array
1586  *     mtch[range]           => array
1587  *     mtch[name]            => str or nil
1588  *
1589  *  Match Reference---<code>MatchData</code> acts as an array, and may be
1590  *  accessed using the normal array indexing techniques.  <i>mtch</i>[0] is
1591  *  equivalent to the special variable <code>$&</code>, and returns the entire
1592  *  matched string.  <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
1593  *  of the matched backreferences (portions of the pattern between parentheses).
1594  *
1595  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1596  *     m          #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
1597  *     m[0]       #=> "HX1138"
1598  *     m[1, 2]    #=> ["H", "X"]
1599  *     m[1..3]    #=> ["H", "X", "113"]
1600  *     m[-3, 2]   #=> ["X", "113"]
1601  *
1602  *     m = /(?<foo>a+)b/.match("ccaaab")
1603  *     m          #=> #<MatchData "aaab" foo:"aaa">
1604  *     m["foo"]   #=> "aaa"
1605  *     m[:foo]    #=> "aaa"
1606  */
1607
1608 static VALUE
1609 match_aref(int argc, VALUE *argv, VALUE match)
1610 {
1611     VALUE idx, rest;
1612
1613     rb_scan_args(argc, argv, "11", &idx, &rest);
1614
1615     if (NIL_P(rest)) {
1616       if (FIXNUM_P(idx)) {
1617         if (FIX2INT(idx) >= 0) {
1618           return rb_reg_nth_match(FIX2INT(idx), match);
1619         }
1620       }
1621       else {
1622         const char *p;
1623         int num;
1624
1625         switch (TYPE(idx)) {
1626           case T_SYMBOL:
1627             p = rb_id2name(SYM2ID(idx));
1628             goto name_to_backref;
1629             break;
1630           case T_STRING:
1631             p = StringValuePtr(idx);
1632
1633           name_to_backref:
1634             num = name_to_backref_number(RMATCH_REGS(match),
1635                        RMATCH(match)->regexp, p, p + strlen(p));
1636             return rb_reg_nth_match(num, match);
1637             break;
1638
1639           default:
1640             break;
1641         }
1642       }
1643     }
1644
1645     return rb_ary_aref(argc, argv, match_to_a(match));
1646 }
1647
1648 static VALUE
1649 match_entry(VALUE match, long n)
1650 {
1651     return rb_reg_nth_match(n, match);
1652 }
1653
1654
1655 /*
1656  *  call-seq:
1657  *
1658  *     mtch.values_at([index]*)   => array
1659  *
1660  *  Uses each <i>index</i> to access the matching values, returning an array of
1661  *  the corresponding matches.
1662  *
1663  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1664  *     m.to_a               #=> ["HX1138", "H", "X", "113", "8"]
1665  *     m.values_at(0, 2, -2)   #=> ["HX1138", "X", "113"]
1666  */
1667
1668 static VALUE
1669 match_values_at(int argc, VALUE *argv, VALUE match)
1670 {
1671     struct re_registers *regs = RMATCH_REGS(match);
1672     return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
1673 }
1674
1675
1676 /*
1677  *  call-seq:
1678  *     mtch.to_s   => str
1679  *
1680  *  Returns the entire matched string.
1681  *
1682  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1683  *     m.to_s   #=> "HX1138"
1684  */
1685
1686 static VALUE
1687 match_to_s(VALUE match)
1688 {
1689     VALUE str = rb_reg_last_match(match);
1690
1691     if (NIL_P(str)) str = rb_str_new(0,0);
1692     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1693     if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
1694     return str;
1695 }
1696
1697
1698 /*
1699  *  call-seq:
1700  *     mtch.string   => str
1701  *
1702  *  Returns a frozen copy of the string passed in to <code>match</code>.
1703  *
1704  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1705  *     m.string   #=> "THX1138."
1706  */
1707
1708 static VALUE
1709 match_string(VALUE match)
1710 {
1711     return RMATCH(match)->str;  /* str is frozen */
1712 }
1713
1714 struct backref_name_tag {
1715     const UChar *name;
1716     long len;
1717 };
1718
1719 static int
1720 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
1721           int back_num, int *back_refs, OnigRegex regex, void *arg0)
1722 {
1723     struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
1724     int i;
1725
1726     for (i = 0; i < back_num; i++) {
1727         arg[back_refs[i]].name = name;
1728         arg[back_refs[i]].len = name_end - name;
1729     }
1730     return 0;
1731 }
1732
1733 /*
1734  * call-seq:
1735  *    mtch.inspect   => str
1736  *
1737  * Returns a printable version of <i>mtch</i>.
1738  *
1739  *     puts /.$/.match("foo").inspect
1740  *     #=> #<MatchData "o">
1741  *
1742  *     puts /(.)(.)(.)/.match("foo").inspect
1743  *     #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
1744  *
1745  *     puts /(.)(.)?(.)/.match("fo").inspect
1746  *     #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
1747  *
1748  *     puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
1749  *     #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
1750  *
1751  */
1752
1753 static VALUE
1754 match_inspect(VALUE match)
1755 {
1756     char *cname = rb_obj_classname(match);
1757     VALUE str;
1758     int i;
1759     struct re_registers *regs = RMATCH_REGS(match);
1760     int num_regs = regs->num_regs;
1761     struct backref_name_tag *names;
1762     VALUE regexp = RMATCH(match)->regexp;
1763
1764     if (regexp == 0) {
1765         return rb_sprintf("#<%s:%p>", cname, (void*)match);
1766     }
1767
1768     names = ALLOCA_N(struct backref_name_tag, num_regs);
1769     MEMZERO(names, struct backref_name_tag, num_regs);
1770
1771     onig_foreach_name(RREGEXP(regexp)->ptr,
1772             match_inspect_name_iter, names);
1773
1774     str = rb_str_buf_new2("#<");
1775     rb_str_buf_cat2(str, cname);
1776
1777     for (i = 0; i < num_regs; i++) {
1778         VALUE v;
1779         rb_str_buf_cat2(str, " ");
1780         if (0 < i) {
1781             if (names[i].name)
1782                 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
1783             else {
1784                 char buf[sizeof(i)*3+1];
1785                 snprintf(buf, sizeof(buf), "%d", i);
1786                 rb_str_buf_cat2(str, buf);
1787             }
1788             rb_str_buf_cat2(str, ":");
1789         }
1790         v = rb_reg_nth_match(i, match);
1791         if (v == Qnil)
1792             rb_str_buf_cat2(str, "nil");
1793         else
1794             rb_str_buf_append(str, rb_str_inspect(v));
1795     }
1796     rb_str_buf_cat2(str, ">");
1797
1798     return str;
1799 }
1800
1801 VALUE rb_cRegexp;
1802
1803 static int
1804 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
1805 {
1806     const char *p = *pp;
1807     int code;
1808     int meta_prefix = 0, ctrl_prefix = 0;
1809     int len;
1810     int retbyte;
1811
1812     retbyte = -1;
1813     if (p == end || *p++ != '\\') {
1814         strcpy(err, "too short escaped multibyte character");
1815         return -1;
1816     }
1817
1818 again:
1819     if (p == end) {
1820         strcpy(err, "too short escape sequence");
1821         return -1;
1822     }
1823     switch (*p++) {
1824       case '\\': code = '\\'; break;
1825       case 'n': code = '\n'; break;
1826       case 't': code = '\t'; break;
1827       case 'r': code = '\r'; break;
1828       case 'f': code = '\f'; break;
1829       case 'v': code = '\013'; break;
1830       case 'a': code = '\007'; break;
1831       case 'e': code = '\033'; break;
1832
1833       /* \OOO */
1834       case '0': case '1': case '2': case '3':
1835       case '4': case '5': case '6': case '7':
1836         p--;
1837         code = ruby_scan_oct(p, end < p+3 ? end-p : 3, &len);
1838         p += len;
1839         break;
1840
1841       case 'x': /* \xHH */
1842         code = ruby_scan_hex(p, end < p+2 ? end-p : 2, &len);
1843         if (len < 1) {
1844             strcpy(err, "invalid hex escape");
1845             return -1;
1846         }
1847         p += len;
1848         break;
1849
1850       case 'M': /* \M-X, \M-\C-X, \M-\cX */
1851         if (meta_prefix) {
1852             strcpy(err, "duplicate meta escape");
1853             return -1;
1854         }
1855         meta_prefix = 1;
1856         if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
1857             if (*p == '\\') {
1858                 p++;
1859                 goto again;
1860             }
1861             else {
1862                 code = *p++;
1863                 break;
1864             }
1865         }
1866         strcpy(err, "too short meta escape");
1867         return -1;
1868
1869       case 'C': /* \C-X, \C-\M-X */
1870         if (p == end || *p++ != '-') {
1871             strcpy(err, "too short control escape");
1872             return -1;
1873         }
1874       case 'c': /* \cX, \c\M-X */
1875         if (ctrl_prefix) {
1876             strcpy(err, "duplicate control escape");
1877             return -1;
1878         }
1879         ctrl_prefix = 1;
1880         if (p < end && (*p & 0x80) == 0) {
1881             if (*p == '\\') {
1882                 p++;
1883                 goto again;
1884             }
1885             else {
1886                 code = *p++;
1887                 break;
1888             }
1889         }
1890         strcpy(err, "too short control escape");
1891         return -1;
1892
1893       default:
1894         strcpy(err, "unexpected escape sequence");
1895         return -1;
1896     }
1897     if (code < 0 || 0xff < code) {
1898         strcpy(err, "invalid escape code");
1899         return -1;
1900     }
1901
1902     if (ctrl_prefix)
1903         code &= 0x1f;
1904     if (meta_prefix)
1905         code |= 0x80;
1906
1907     *pp = p;
1908     return code;
1909 }
1910
1911 static int
1912 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
1913         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
1914 {
1915     const char *p = *pp;
1916     int chmaxlen = rb_enc_mbmaxlen(enc);
1917     char *chbuf = ALLOCA_N(char, chmaxlen);
1918     int chlen = 0;
1919     int byte;
1920     int l;
1921
1922     memset(chbuf, 0, chmaxlen);
1923
1924     byte = read_escaped_byte(&p, end, err);
1925     if (byte == -1) {
1926         return -1;
1927     }
1928
1929     chbuf[chlen++] = byte;
1930     while (chlen < chmaxlen &&
1931            MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
1932         byte = read_escaped_byte(&p, end, err);
1933         if (byte == -1) {
1934             return -1;
1935         }
1936         chbuf[chlen++] = byte;
1937     }
1938
1939     l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
1940     if (MBCLEN_INVALID_P(l)) {
1941         strcpy(err, "invalid multibyte escape");
1942         return -1;
1943     }
1944     if (1 < chlen || (chbuf[0] & 0x80)) {
1945         rb_str_buf_cat(buf, chbuf, chlen);
1946
1947         if (*encp == 0)
1948             *encp = enc;
1949         else if (*encp != enc) {
1950             strcpy(err, "escaped non ASCII character in UTF-8 regexp");
1951             return -1;
1952         }
1953     }
1954     else {
1955         char escbuf[5];
1956         snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
1957         rb_str_buf_cat(buf, escbuf, 4);
1958     }
1959     *pp = p;
1960     return 0;
1961 }
1962
1963 static int
1964 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
1965 {
1966     if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */
1967         0x10ffff < code) {
1968         strcpy(err, "invalid Unicode range");
1969         return -1;
1970     }
1971     return 0;
1972 }
1973
1974 static int
1975 append_utf8(unsigned long uv,
1976         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
1977 {
1978     if (check_unicode_range(uv, err) != 0)
1979         return -1;
1980     if (uv < 0x80) {
1981         char escbuf[5];
1982         snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
1983         rb_str_buf_cat(buf, escbuf, 4);
1984     }
1985     else {
1986         int len;
1987         char utf8buf[6];
1988         len = rb_uv_to_utf8(utf8buf, uv);
1989         rb_str_buf_cat(buf, utf8buf, len);
1990
1991         if (*encp == 0)
1992             *encp = rb_utf8_encoding();
1993         else if (*encp != rb_utf8_encoding()) {
1994             strcpy(err, "UTF-8 character in non UTF-8 regexp");
1995             return -1;
1996         }
1997     }
1998     return 0;
1999 }
2000
2001 static int
2002 unescape_unicode_list(const char **pp, const char *end,
2003         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
2004 {
2005     const char *p = *pp;
2006     int has_unicode = 0;
2007     unsigned long code;
2008     int len;
2009
2010     while (p < end && ISSPACE(*p)) p++;
2011
2012     while (1) {
2013         code = ruby_scan_hex(p, end-p, &len);
2014         if (len == 0)
2015             break;
2016         if (6 < len) { /* max 10FFFF */
2017             strcpy(err, "invalid Unicode range");
2018             return -1;
2019         }
2020         p += len;
2021         if (append_utf8(code, buf, encp, err) != 0)
2022             return -1;
2023         has_unicode = 1;
2024
2025         while (p < end && ISSPACE(*p)) p++;
2026     }
2027
2028     if (has_unicode == 0) {
2029         strcpy(err, "invalid Unicode list");
2030         return -1;
2031     }
2032
2033     *pp = p;
2034
2035     return 0;
2036 }
2037
2038 static int
2039 unescape_unicode_bmp(const char **pp, const char *end,
2040         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
2041 {
2042     const char *p = *pp;
2043     int len;
2044     unsigned long code;
2045
2046     if (end < p+4) {
2047         strcpy(err, "invalid Unicode escape");
2048         return -1;
2049     }
2050     code = ruby_scan_hex(p, 4, &len);
2051     if (len != 4) {
2052         strcpy(err, "invalid Unicode escape");
2053         return -1;
2054     }
2055     if (append_utf8(code, buf, encp, err) != 0)
2056         return -1;
2057     *pp = p + 4;
2058     return 0;
2059 }
2060
/*
 * Copy the regexp source in [p, end) into buf, replacing escapes that may
 * denote non-ASCII data and leaving all other escapes as written.
 *
 *   p, end  the source text to scan
 *   enc     encoding used to split the source into characters
 *   buf     string buffer that receives the processed source
 *   encp    in/out: encoding the regexp has been pinned to so far (0 = none);
 *           set to enc when a raw non-ASCII character is copied, and passed
 *           on to the escape helpers, which may update it as well
 *   err     receives an error message on failure
 *
 * Returns 0 on success and -1 on failure (err is filled in).
 */
static int
unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
        VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
{
    char c;
    char smallbuf[2];

    while (p < end) {
        int chlen = rb_enc_precise_mbclen(p, end, enc);
        if (!MBCLEN_CHARFOUND_P(chlen)) {
            strcpy(err, "invalid multibyte character");
            return -1;
        }
        chlen = MBCLEN_CHARFOUND_LEN(chlen);
        /* a multibyte character, or a single byte with the high bit set:
           copy it verbatim and pin the regexp to the source encoding */
        if (1 < chlen || (*p & 0x80)) {
            rb_str_buf_cat(buf, p, chlen);
            p += chlen;
            if (*encp == 0)
                *encp = enc;
            else if (*encp != enc) {
                /* an earlier part of the pattern already pinned a
                   different encoding */
                strcpy(err, "non ASCII character in UTF-8 regexp");
                return -1;
            }
            continue;
        }

        /* from here on the current character is a single 7bit byte */
        switch (c = *p++) {
          case '\\':
            if (p == end) {
                strcpy(err, "too short escape sequence");
                return -1;
            }
            switch (c = *p++) {
              case '1': case '2': case '3':
              case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
                {
                    int octlen;
                    if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
                        /* backref or 7bit octal.
                           no need to unescape anyway.
                           re-escaping may break backref */
                        goto escape_asis;
                    }
                }
                /* xxx: How about more than 199 subexpressions? */
                /* fall through: octal value above 0177 */

              case '0': /* \0, \0O, \0OO */
                /* fall through */

              case 'x': /* \xHH */
              case 'c': /* \cX, \c\M-X */
              case 'C': /* \C-X, \C-\M-X */
              case 'M': /* \M-X, \M-\C-X, \M-\cX */
                /* rewind to the backslash and let the helper consume the
                   whole escape; it advances p itself */
                p = p-2;
                if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
                    return -1;
                break;

              case 'u':
                if (p == end) {
                    strcpy(err, "too short escape sequence");
                    return -1;
                }
                if (*p == '{') {
                    /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
                    p++;
                    if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
                        return -1;
                    /* the list must be terminated by a closing brace */
                    if (p == end || *p++ != '}') {
                        strcpy(err, "invalid Unicode list");
                        return -1;
                    }
                    break;
                }
                else {
                    /* \uHHHH */
                    if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
                        return -1;
                    break;
                }

              default: /* \n, \\, \d, \9, etc. */
escape_asis:
                /* emit the backslash and the escaped character unchanged */
                smallbuf[0] = '\\';
                smallbuf[1] = c;
                rb_str_buf_cat(buf, smallbuf, 2);
                break;
            }
            break;

          default:
            /* ordinary 7bit character: copy as is */
            rb_str_buf_cat(buf, &c, 1);
            break;
        }
    }

    return 0;
}
2158
2159 static VALUE
2160 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
2161         rb_encoding **fixed_enc, onig_errmsg_buffer err)
2162 {
2163     VALUE buf;
2164
2165     buf = rb_str_buf_new(0);
2166
2167     if (rb_enc_asciicompat(enc))
2168         *fixed_enc = 0;
2169     else {
2170         *fixed_enc = enc;
2171         rb_enc_associate(buf, enc);
2172     }
2173
2174     if (unescape_nonascii(p, end, enc, buf, fixed_enc, err) != 0)
2175         return Qnil;
2176
2177     if (*fixed_enc) {
2178         rb_enc_associate(buf, *fixed_enc);
2179     }
2180
2181     return buf;
2182 }
2183
2184 VALUE
2185 rb_reg_check_preprocess(VALUE str)
2186 {
2187     rb_encoding *fixed_enc = 0;
2188     onig_errmsg_buffer err = "";
2189     VALUE buf;
2190     char *p, *end;
2191     rb_encoding *enc;
2192
2193     StringValue(str);
2194     p = RSTRING_PTR(str);
2195     end = p + RSTRING_LEN(str);
2196     enc = rb_enc_get(str);
2197
2198     buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
2199     RB_GC_GUARD(str);
2200
2201     if (buf == Qnil) {
2202         return rb_reg_error_desc(str, 0, err);
2203     }
2204     return Qnil;
2205 }
2206
2207 static VALUE
2208 rb_reg_preprocess_dregexp(VALUE ary)
2209 {
2210     rb_encoding *fixed_enc = 0;
2211     rb_encoding *regexp_enc = 0;
2212     onig_errmsg_buffer err = "";
2213     int i;
2214     VALUE result = 0;
2215     int argc = RARRAY_LEN(ary);
2216     VALUE *argv = RARRAY_PTR(ary);
2217
2218     if (argc == 0) {
2219         rb_raise(rb_eArgError, "no arguments given");
2220     }
2221
2222     for (i = 0; i < argc; i++) {
2223         VALUE str = argv[i];
2224         VALUE buf;
2225         char *p, *end;
2226         rb_encoding *src_enc;
2227
2228         StringValue(str);
2229         p = RSTRING_PTR(str);
2230         end = p + RSTRING_LEN(str);
2231         src_enc = rb_enc_get(str);
2232
2233         buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
2234
2235         if (buf == Qnil)
2236             rb_raise(rb_eArgError, "%s", err);
2237
2238         if (fixed_enc != 0) {
2239             if (regexp_enc != 0 && regexp_enc != fixed_enc) {
2240                 rb_raise(rb_eArgError, "encoding mismatch in dynamic regexp : %s and %s",
2241                          rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
2242             }
2243             regexp_enc = fixed_enc;
2244         }
2245
2246         if (!result)
2247             result = rb_str_new3(str);
2248         else
2249             rb_str_buf_append(result, str);
2250     }
2251     if (regexp_enc) {
2252         rb_enc_associate(result, regexp_enc);
2253     }
2254
2255     return result;
2256 }
2257
2258 static int
2259 rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
2260                   int options, onig_errmsg_buffer err)
2261 {
2262     struct RRegexp *re = RREGEXP(obj);
2263     VALUE unescaped;
2264     rb_encoding *fixed_enc = 0;
2265     rb_encoding *a_enc = rb_ascii8bit_encoding();
2266
2267     if (!OBJ_TAINTED(obj) && rb_safe_level() >= 4)
2268         rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
2269     rb_check_frozen(obj);
2270     if (FL_TEST(obj, REG_LITERAL))
2271         rb_raise(rb_eSecurityError, "can't modify literal regexp");
2272     if (re->ptr) onig_free(re->ptr);
2273     if (re->str) free(re->str);
2274     re->ptr = 0;
2275     re->str = 0;
2276
2277     unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
2278     if (unescaped == Qnil)
2279         return -1;
2280
2281     if (fixed_enc) {
2282         if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
2283             (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
2284             strcpy(err, "incompatible character encoding");
2285             return -1;
2286         }
2287         if (fixed_enc != a_enc) {
2288             options |= ARG_ENCODING_FIXED;
2289             enc = fixed_enc;
2290         }
2291     }
2292     else if (!(options & ARG_ENCODING_FIXED)) {
2293        enc = rb_usascii_encoding();
2294     }
2295
2296     rb_enc_associate((VALUE)re, enc);
2297     if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
2298         re->basic.flags |= KCODE_FIXED;
2299     }
2300     if (options & ARG_ENCODING_NONE) {
2301         re->basic.flags |= REG_ENCODING_NONE;
2302     }
2303
2304     re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
2305                           options & ARG_REG_OPTION_MASK, err);
2306     if (!re->ptr) return -1;
2307     re->str = ALLOC_N(char, len+1);
2308     memcpy(re->str, s, len);
2309     re->str[len] = '\0';
2310     re->len = len;
2311     RB_GC_GUARD(unescaped);
2312     return 0;
2313 }
2314
2315 static int
2316 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err)
2317 {
2318     int ret;
2319     rb_encoding *enc = rb_enc_get(str);
2320     if (options & ARG_ENCODING_NONE) {
2321         rb_encoding *ascii8bit = rb_ascii8bit_encoding();
2322         if (enc != ascii8bit) {
2323             if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
2324                 strcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2325                 return -1;
2326             }
2327             enc = ascii8bit;
2328         }
2329     }
2330     ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
2331                             options, err);
2332     RB_GC_GUARD(str);
2333     return ret;
2334 }
2335
2336 static VALUE
2337 rb_reg_s_alloc(VALUE klass)
2338 {
2339     NEWOBJ(re, struct RRegexp);
2340     OBJSETUP(re, klass, T_REGEXP);
2341
2342     re->ptr = 0;
2343     re->len = 0;
2344     re->str = 0;
2345
2346     return (VALUE)re;
2347 }
2348
2349 VALUE
2350 rb_reg_new_str(VALUE s, int options)
2351 {
2352     VALUE re = rb_reg_s_alloc(rb_cRegexp);
2353     onig_errmsg_buffer err = "";
2354
2355     if (rb_reg_initialize_str(re, s, options, err) != 0) {
2356         rb_reg_raise_str(s, options, err);
2357     }
2358
2359     return re;
2360 }
2361
2362 VALUE
2363 rb_reg_new_ary(VALUE ary, int opt)
2364 {
2365     return rb_reg_new_str(rb_reg_preprocess_dregexp(ary), opt);
2366 }
2367
2368 VALUE
2369 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
2370 {
2371     VALUE re = rb_reg_s_alloc(rb_cRegexp);
2372     onig_errmsg_buffer err = "";
2373
2374     if (rb_reg_initialize(re, s, len, enc, options, err) != 0) {
2375         rb_enc_reg_raise(s, len, enc, options, err);
2376     }
2377
2378     return re;
2379 }
2380
2381 VALUE
2382 rb_reg_new(const char *s, long len, int options)
2383 {
2384     return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
2385 }
2386
2387 VALUE
2388 rb_reg_compile(VALUE str, int options)
2389 {
2390     VALUE re = rb_reg_s_alloc(rb_cRegexp);
2391     onig_errmsg_buffer err = "";
2392
2393     if (!str) str = rb_str_new(0,0);
2394     if (rb_reg_initialize_str(re, str, options, err) != 0) {
2395         rb_set_errinfo(rb_reg_error_desc(str, options, err));
2396         return Qnil;
2397     }
2398     FL_SET(re, REG_LITERAL);
2399     return re;
2400 }
2401
2402 static VALUE reg_cache;
2403
2404 VALUE
2405 rb_reg_regcomp(VALUE str)
2406 {
2407     volatile VALUE save_str = str;
2408     if (reg_cache && RREGEXP(reg_cache)->len == RSTRING_LEN(str)
2409         && ENCODING_GET(reg_cache) == ENCODING_GET(str)
2410         && memcmp(RREGEXP(reg_cache)->str, RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
2411         return reg_cache;
2412
2413     return reg_cache = rb_reg_new_str(save_str, 0);
2414 }
2415
2416 /*
2417  * call-seq:
2418  *   rxp.hash   => fixnum
2419  *
2420  * Produce a hash based on the text and options of this regular expression.
2421  */
2422
2423 static VALUE
2424 rb_reg_hash(VALUE re)
2425 {
2426     int hashval, len;
2427     char *p;
2428
2429     rb_reg_check(re);
2430     hashval = RREGEXP(re)->ptr->options;
2431     len = RREGEXP(re)->len;
2432     p  = RREGEXP(re)->str;
2433     while (len--) {
2434         hashval = hashval * 33 + *p++;
2435     }
2436     hashval = hashval + (hashval>>5);
2437
2438     return INT2FIX(hashval);
2439 }
2440
2441
2442 /*
2443  *  call-seq:
2444  *     rxp == other_rxp      => true or false
2445  *     rxp.eql?(other_rxp)   => true or false
2446  *
2447  *  Equality---Two regexps are equal if their patterns are identical, they have
2448  *  the same character set code, and their <code>casefold?</code> values are the
2449  *  same.
2450  *
2451  *     /abc/  == /abc/x   #=> false
2452  *     /abc/  == /abc/i   #=> false
2453  *     /abc/  == /abc/n   #=> false
2454  *     /abc/u == /abc/n   #=> false
2455  */
2456
2457 static VALUE
2458 rb_reg_equal(VALUE re1, VALUE re2)
2459 {
2460     if (re1 == re2) return Qtrue;
2461     if (TYPE(re2) != T_REGEXP) return Qfalse;
2462     rb_reg_check(re1); rb_reg_check(re2);
2463     if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
2464     if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
2465     if (RREGEXP(re1)->len != RREGEXP(re2)->len) return Qfalse;
2466     if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
2467     if (memcmp(RREGEXP(re1)->str, RREGEXP(re2)->str, RREGEXP(re1)->len) == 0) {
2468         return Qtrue;
2469     }
2470     return Qfalse;
2471 }
2472
2473 static VALUE
2474 reg_operand(VALUE s, int check)
2475 {
2476     if (SYMBOL_P(s)) {
2477         return rb_sym_to_s(s);
2478     }
2479     else {
2480         VALUE tmp = rb_check_string_type(s);
2481         if (check && NIL_P(tmp)) {
2482             rb_raise(rb_eTypeError, "can't convert %s to String",
2483                      rb_obj_classname(s));
2484         }
2485         return tmp;
2486     }
2487 }
2488
2489 static long
2490 reg_match_pos(VALUE re, VALUE *strp, long pos)
2491 {
2492     VALUE str = *strp;
2493
2494     if (NIL_P(str)) {
2495         rb_backref_set(Qnil);
2496         return -1;
2497     }
2498     *strp = str = reg_operand(str, Qtrue);
2499     if (pos != 0) {
2500         if (pos < 0) {
2501             VALUE l = rb_str_length(str);
2502             pos += NUM2INT(l);
2503             if (pos < 0) {
2504                 return pos;
2505             }
2506         }
2507         pos = rb_reg_adjust_startpos(re, str, pos, 0);
2508     }
2509     return rb_reg_search(re, str, pos, 0);
2510 }
2511
2512 /*
2513  *  call-seq:
2514  *     rxp =~ str    => integer or nil
2515  *
2516  *  Match---Matches <i>rxp</i> against <i>str</i>.
2517  *
2518  *     /at/ =~ "input data"   #=> 7
2519  *     /ax/ =~ "input data"   #=> nil
2520  *
 *  If <code>=~</code> is used with a regexp literal with named captures,
 *  the captured strings (or nil) are assigned to local variables named by
 *  the capture names.
2524  *
2525  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = y  "
2526  *     p lhs    #=> "x"
2527  *     p rhs    #=> "y"
2528  *
2529  *  If it is not matched, nil is assigned for the variables.
2530  *
2531  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = "
2532  *     p lhs    #=> nil
2533  *     p rhs    #=> nil
2534  *
 *  This assignment is implemented in the Ruby parser,
 *  so a regexp literal is required for the assignment.
 *  The assignment does not occur if the regexp is not a literal.
2538  *
2539  *     re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
2540  *     re =~ "  x = "
2541  *     p lhs    # undefined local variable
2542  *     p rhs    # undefined local variable
2543  *
2544  *  A regexp interpolation, <code>#{}</code>, also disables
2545  *  the assignment.
2546  *
2547  *     rhs_pat = /(?<rhs>\w+)/
2548  *     /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
2549  *     p lhs    # undefined local variable
2550  *
2551  */
2552
2553 VALUE
2554 rb_reg_match(VALUE re, VALUE str)
2555 {
2556     long pos = reg_match_pos(re, &str, 0);
2557     if (pos < 0) return Qnil;
2558     pos = rb_str_sublen(str, pos);
2559     return LONG2FIX(pos);
2560 }
2561
2562 /*
2563  *  call-seq:
2564  *     rxp === str   => true or false
2565  *
2566  *  Case Equality---Synonym for <code>Regexp#=~</code> used in case statements.
2567  *
2568  *     a = "HELLO"
2569  *     case a
2570  *     when /^[a-z]*$/; print "Lower case\n"
2571  *     when /^[A-Z]*$/; print "Upper case\n"
2572  *     else;            print "Mixed case\n"
2573  *     end
2574  *
2575  *  <em>produces:</em>
2576  *
2577  *     Upper case
2578  */
2579
2580 VALUE
2581 rb_reg_eqq(VALUE re, VALUE str)
2582 {
2583     long start;
2584
2585     str = reg_operand(str, Qfalse);
2586     if (NIL_P(str)) {
2587         rb_backref_set(Qnil);
2588         return Qfalse;
2589     }
2590     start = rb_reg_search(re, str, 0, 0);
2591     if (start < 0) {
2592         return Qfalse;
2593     }
2594     return Qtrue;
2595 }
2596
2597
2598 /*
2599  *  call-seq:
2600  *     ~ rxp   => integer or nil
2601  *
2602  *  Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
2603  *  Equivalent to <code><i>rxp</i> =~ $_</code>.
2604  *
2605  *     $_ = "input data"
2606  *     ~ /at/   #=> 7
2607  */
2608
2609 VALUE
2610 rb_reg_match2(VALUE re)
2611 {
2612     long start;
2613     VALUE line = rb_lastline_get();
2614
2615     if (TYPE(line) != T_STRING) {
2616         rb_backref_set(Qnil);
2617         return Qnil;
2618     }
2619
2620     start = rb_reg_search(re, line, 0, 0);
2621     if (start < 0) {
2622         return Qnil;
2623     }
2624     start = rb_str_sublen(line, start);
2625     return LONG2FIX(start);
2626 }
2627
2628
2629 /*
2630  *  call-seq:
2631  *     rxp.match(str)       => matchdata or nil
2632  *     rxp.match(str,pos)   => matchdata or nil
2633  *
2634  *  Returns a <code>MatchData</code> object describing the match, or
2635  *  <code>nil</code> if there was no match. This is equivalent to retrieving the
2636  *  value of the special variable <code>$~</code> following a normal match.
2637  *  If the second parameter is present, it specifies the position in the string
2638  *  to begin the search.
2639  *
2640  *     /(.)(.)(.)/.match("abc")[2]   #=> "b"
2641  *     /(.)(.)/.match("abc", 1)[2]   #=> "c"
2642  *
 *  If a block is given, it is invoked with the MatchData when the match
 *  succeeds, so that you can write
2645  *
2646  *     pat.match(str) {|m| ...}
2647  *
2648  *  instead of
2649  *
2650  *     if m = pat.match(str)
2651  *       ...
2652  *     end
2653  *
2654  *  The return value is a value from block execution in this case.
2655  */
2656
2657 static VALUE
2658 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
2659 {
2660     VALUE result, str, initpos;
2661     long pos;
2662
2663     if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
2664         pos = NUM2LONG(initpos);
2665     }
2666     else {
2667         pos = 0;
2668     }
2669
2670     pos = reg_match_pos(re, &str, pos);
2671     if (pos < 0) {
2672         rb_backref_set(Qnil);
2673         return Qnil;
2674     }
2675     result = rb_backref_get();
2676     rb_match_busy(result);
2677     if (!NIL_P(result) && rb_block_given_p()) {
2678         return rb_yield(result);
2679     }
2680     return result;
2681 }
2682
2683 /*
2684  * Document-method: compile
2685  *
2686  * Synonym for <code>Regexp.new</code>
2687  */
2688
2689
2690 /*
2691  *  call-seq:
2692  *     Regexp.new(string [, options])                => regexp
2693  *     Regexp.new(regexp)                            => regexp
2694  *     Regexp.compile(string [, options])            => regexp
2695  *     Regexp.compile(regexp)                        => regexp
2696  *
 *  Constructs a new regular expression from <i>pattern</i>, which can be either
 *  a <code>String</code> or a <code>Regexp</code> (in which case that regexp's
 *  options are propagated, and new options may not be specified; a change as of
 *  Ruby 1.8). If <i>options</i> is a <code>Fixnum</code>, it should be one or
2701  *  more of the constants <code>Regexp::EXTENDED</code>,
2702  *  <code>Regexp::IGNORECASE</code>, and <code>Regexp::MULTILINE</code>,
2703  *  <em>or</em>-ed together. Otherwise, if <i>options</i> is not
2704  *  <code>nil</code>, the regexp will be case insensitive.
2705  *
2706  *     r1 = Regexp.new('^a-z+:\\s+\w+')           #=> /^a-z+:\s+\w+/
2707  *     r2 = Regexp.new('cat', true)               #=> /cat/i
2708  *     r3 = Regexp.new('dog', Regexp::EXTENDED)   #=> /dog/x
2709  *     r4 = Regexp.new(r2)                        #=> /cat/i
2710  */
2711
2712 static VALUE
2713 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
2714 {
2715     onig_errmsg_buffer err = "";
2716     int flags = 0;
2717     VALUE str;
2718     rb_encoding *enc;
2719     const char *ptr;
2720     long len;
2721
2722     if (argc == 0 || argc > 3) {
2723         rb_raise(rb_eArgError, "wrong number of arguments");
2724     }
2725     if (TYPE(argv[0]) == T_REGEXP) {
2726         VALUE re = argv[0];
2727
2728         if (argc > 1) {
2729             rb_warn("flags ignored");
2730         }
2731         rb_reg_check(re);
2732         flags = rb_reg_options(re);
2733         ptr = RREGEXP(re)->str;
2734         len = RREGEXP(re)->len;
2735         enc = rb_enc_get(re);
2736         if (rb_reg_initialize(self, ptr, len, enc, flags, err)) {
2737             str = rb_enc_str_new(ptr, len, enc);
2738             rb_reg_raise_str(str, flags, err);
2739         }
2740     }
2741     else {
2742         if (argc >= 2) {
2743             if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
2744             else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
2745         }
2746         enc = 0;
2747         if (argc == 3 && !NIL_P(argv[2])) {
2748             char *kcode = StringValuePtr(argv[2]);
2749             if (kcode[0] == 'n' || kcode[1] == 'N') {
2750                 enc = rb_ascii8bit_encoding();
2751                 flags |= ARG_ENCODING_FIXED;
2752             }
2753             else {
2754                 rb_warning("encoding option is obsolete - %s", kcode);
2755             }
2756         }
2757         str = argv[0];
2758         ptr = StringValuePtr(str);
2759         if (enc
2760             ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err)
2761             : rb_reg_initialize_str(self, str, flags, err)) {
2762             rb_reg_raise_str(str, flags, err);
2763         }
2764     }
2765     return self;
2766 }
2767
/*
 * Return a copy of str in which regexp metacharacters are backslash-escaped.
 *
 * The string is first scanned for a character that needs escaping.  If there
 * is none, a copy made with rb_str_new3() is returned.  Otherwise a new
 * string is built: the part before the first metacharacter is copied as is
 * and the rest is copied character by character, with escapes inserted.
 *
 * When str is ASCII only the result is tagged US-ASCII; on the escaping path
 * a non-ASCII-only result gets the encoding of str.
 */
VALUE
rb_reg_quote(VALUE str)
{
    rb_encoding *enc = rb_enc_get(str);
    char *s, *send, *t;
    VALUE tmp;
    int c, clen;
    int ascii_only = rb_enc_str_asciionly_p(str);

    s = RSTRING_PTR(str);
    send = s + RSTRING_LEN(str);
    /* pass 1: look for the first character that needs escaping */
    while (s < send) {
        c = rb_enc_ascget(s, send, &clen, enc);
        if (c == -1) {
            /* not an ASCII character: skip the whole character */
            s += mbclen(s, send, enc);
            continue;
        }
        switch (c) {
          case '[': case ']': case '{': case '}':
          case '(': case ')': case '|': case '-':
          case '*': case '.': case '\\':
          case '?': case '+': case '^': case '$':
          case ' ': case '#':
          case '\t': case '\f': case '\v': case '\n': case '\r':
            goto meta_found;
        }
        s += clen;
    }
    /* nothing to escape */
    tmp = rb_str_new3(str);
    if (ascii_only) {
        rb_enc_associate(tmp, rb_usascii_encoding());
    }
    return tmp;

  meta_found:
    /* every escape written below is two bytes for one source byte, so twice
       the source length is enough room */
    tmp = rb_str_new(0, RSTRING_LEN(str)*2);
    if (ascii_only) {
        rb_enc_associate(tmp, rb_usascii_encoding());
    }
    else {
        rb_enc_copy(tmp, str);
    }
    t = RSTRING_PTR(tmp);
    /* copy up to the first metacharacter */
    memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
    t += s - RSTRING_PTR(str);

    /* pass 2: copy the rest, inserting escapes */
    while (s < send) {
        c = rb_enc_ascget(s, send, &clen, enc);
        if (c == -1) {
            /* not an ASCII character: copy its bytes unchanged */
            int n = mbclen(s, send, enc);

            while (n--)
                *t++ = *s++;
            continue;
        }
        s += clen;
        switch (c) {
          case '[': case ']': case '{': case '}':
          case '(': case ')': case '|': case '-':
          case '*': case '.': case '\\':
          case '?': case '+': case '^': case '$':
          case '#':
            /* emit the backslash here; the character itself is written
               after the switch */
            *t++ = '\\';
            break;
          case ' ':
            *t++ = '\\';
            *t++ = ' ';
            continue;
          /* whitespace control characters are written as their
             backslash-letter escapes */
          case '\t':
            *t++ = '\\';
            *t++ = 't';
            continue;
          case '\n':
            *t++ = '\\';
            *t++ = 'n';
            continue;
          case '\r':
            *t++ = '\\';
            *t++ = 'r';
            continue;
          case '\f':
            *t++ = '\\';
            *t++ = 'f';
            continue;
          case '\v':
            *t++ = '\\';
            *t++ = 'v';
            continue;
        }
        *t++ = c;
    }
    /* trim the buffer to what was actually written */
    rb_str_resize(tmp, t - RSTRING_PTR(tmp));
    OBJ_INFECT(tmp, str);
    return tmp;
}
2864
2865
2866 /*
2867  *  call-seq:
2868  *     Regexp.escape(str)   => string
2869  *     Regexp.quote(str)    => string
2870  *
2871  *  Escapes any characters that would have special meaning in a regular
2872  *  expression. Returns a new escaped string, or self if no characters are
2873  *  escaped.  For any string,
2874  *  <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
2875  *
2876  *     Regexp.escape('\*?{}.')   #=> \\\*\?\{\}\.
2877  *
2878  */
2879
2880 static VALUE
2881 rb_reg_s_quote(VALUE c, VALUE str)
2882 {
2883     return rb_reg_quote(reg_operand(str, Qtrue));
2884 }
2885
2886 int
2887 rb_reg_options(VALUE re)
2888 {
2889     int options;
2890
2891     rb_reg_check(re);
2892     options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
2893     if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
2894     if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
2895     return options;
2896 }
2897
2898 VALUE
2899 rb_check_regexp_type(VALUE re)
2900 {
2901     return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
2902 }
2903
2904 /*
2905  *  call-seq:
2906  *     Regexp.try_convert(obj) -> re or nil
2907  *
2908  *  Try to convert <i>obj</i> into a Regexp, using to_regexp method.
2909  *  Returns converted regexp or nil if <i>obj</i> cannot be converted
2910  *  for any reason.
2911  *
2912  *     Regexp.try_convert(/re/)         #=> /re/
2913  *     Regexp.try_convert("re")         #=> nil
2914  *
2915  *     o = Object.new
2916  *     Regexp.try_convert(o)            #=> nil
2917  *     def o.to_regexp() /foo/ end
2918  *     Regexp.try_convert(o)            #=> /foo/
2919  *
2920  */
2921 static VALUE
2922 rb_reg_s_try_convert(VALUE dummy, VALUE re)
2923 {
2924     return rb_check_regexp_type(re);
2925 }
2926
2927 static VALUE
2928 rb_reg_s_union(VALUE self, VALUE args0)
2929 {
2930     long argc = RARRAY_LEN(args0);
2931
2932     if (argc == 0) {
2933         VALUE args[1];
2934         args[0] = rb_str_new2("(?!)");
2935         return rb_class_new_instance(1, args, rb_cRegexp);
2936     }
2937     else if (argc == 1) {
2938         VALUE arg = rb_ary_entry(args0, 0);
2939         VALUE re = rb_check_regexp_type(arg);
2940         if (!NIL_P(re))
2941             return re;
2942         else {
2943             VALUE quoted;
2944             quoted = rb_reg_s_quote(Qnil, arg);
2945             return rb_reg_new_str(quoted, 0);
2946         }
2947     }
2948     else {
2949         int i;
2950         VALUE source = rb_str_buf_new(0);
2951         rb_encoding *result_enc;
2952
2953         int has_asciionly = 0;
2954         rb_encoding *has_ascii_compat_fixed = 0;
2955         rb_encoding *has_ascii_incompat = 0;
2956
2957         for (i = 0; i < argc; i++) {
2958             volatile VALUE v;
2959             VALUE e = rb_ary_entry(args0, i);
2960
2961             if (0 < i)
2962                 rb_str_buf_cat_ascii(source, "|");
2963
2964             v = rb_check_regexp_type(e);
2965             if (!NIL_P(v)) {
2966                 rb_encoding *enc = rb_enc_get(v);
2967                 if (!rb_enc_asciicompat(enc)) {
2968                     if (!has_ascii_incompat)
2969                         has_ascii_incompat = enc;
2970                     else if (has_ascii_incompat != enc)
2971                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2972                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
2973                 }
2974                 else if (rb_reg_fixed_encoding_p(v)) {
2975                     if (!has_ascii_compat_fixed)
2976                         has_ascii_compat_fixed = enc;
2977                     else if (has_ascii_compat_fixed != enc)
2978                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2979                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
2980                 }
2981                 else {
2982                     has_asciionly = 1;
2983                 }
2984                 v = rb_reg_to_s(v);
2985             }
2986             else {
2987                 rb_encoding *enc = rb_enc_get(e);
2988                 StringValue(e);
2989                 enc = rb_enc_get(e);
2990                 if (!rb_enc_str_asciicompat_p(e)) {
2991                     if (!has_ascii_incompat)
2992                         has_ascii_incompat = enc;
2993                     else if (has_ascii_incompat != enc)
2994                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2995                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
2996                 }
2997                 else if (rb_enc_str_asciionly_p(e)) {
2998                     has_asciionly = 1;
2999                 }
3000                 else {
3001                     if (!has_ascii_compat_fixed)
3002                         has_ascii_compat_fixed = enc;
3003                     else if (has_ascii_compat_fixed != enc)
3004                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3005                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
3006                 }
3007                 v = rb_reg_s_quote(Qnil, e);
3008             }
3009             if (has_ascii_incompat) {
3010                 if (has_asciionly) {
3011                     rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
3012                         rb_enc_name(has_ascii_incompat));
3013                 }
3014                 if (has_ascii_compat_fixed) {
3015                     rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3016                         rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
3017                 }
3018             }
3019
3020             if (i == 0) {
3021                 rb_enc_copy(source, v);
3022             }
3023             rb_str_append(source, v);
3024         }
3025
3026         if (has_ascii_incompat) {
3027             result_enc = has_ascii_incompat;
3028         }
3029         else if (has_ascii_compat_fixed) {
3030             result_enc = has_ascii_compat_fixed;
3031         }
3032         else {
3033             result_enc = rb_ascii8bit_encoding();
3034         }
3035
3036         rb_enc_associate(source, result_enc);
3037         return rb_class_new_instance(1, &source, rb_cRegexp);
3038     }
3039 }
3040
3041 /*
3042  *  call-seq:
3043  *     Regexp.union(pat1, pat2, ...)            => new_regexp
3044  *     Regexp.union(pats_ary)                   => new_regexp
3045  *
3046  *  Return a <code>Regexp</code> object that is the union of the given
3047  *  <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s
3048  *  can be Regexp objects, in which case their options will be preserved, or
3049  *  Strings. If no patterns are given, returns <code>/(?!)/</code>.
3050  *
3051  *     Regexp.union                         #=> /(?!)/
3052  *     Regexp.union("penzance")             #=> /penzance/
3053  *     Regexp.union("a+b*c")                #=> /a\+b\*c/
3054  *     Regexp.union("skiing", "sledding")   #=> /skiing|sledding/
3055  *     Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
3056  *     Regexp.union(/dogs/, /cats/i)        #=> /(?-mix:dogs)|(?i-mx:cats)/
3057  */
3058 static VALUE
3059 rb_reg_s_union_m(VALUE self, VALUE args)
3060 {
3061     VALUE v;
3062     if (RARRAY_LEN(args) == 1 &&
3063         !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
3064         return rb_reg_s_union(self, v);
3065     }
3066     return rb_reg_s_union(self, args);
3067 }
3068
3069 /* :nodoc: */
3070 static VALUE
3071 rb_reg_init_copy(VALUE copy, VALUE re)
3072 {
3073     onig_errmsg_buffer err = "";
3074     const char *s;
3075     long len;
3076
3077     if (copy == re) return copy;
3078     rb_check_frozen(copy);
3079     /* need better argument type check */
3080     if (!rb_obj_is_instance_of(re, rb_obj_class(copy))) {
3081         rb_raise(rb_eTypeError, "wrong argument type");
3082     }
3083     rb_reg_check(re);
3084     s = RREGEXP(re)->str;
3085     len = RREGEXP(re)->len;
3086     if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re), err) != 0) {
3087         rb_reg_raise(s, len, err, re);
3088     }
3089     return copy;
3090 }
3091
3092 VALUE
3093 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
3094 {
3095     VALUE val = 0;
3096     char *p, *s, *e;
3097     int no, clen;
3098     rb_encoding *str_enc = rb_enc_get(str);
3099     rb_encoding *src_enc = rb_enc_get(src);
3100     int acompat = rb_enc_asciicompat(str_enc);
3101 #define ASCGET(s,e,cl) (acompat ? (*cl=1,s[0]) : rb_enc_ascget(s, e, cl, str_enc))
3102
3103     p = s = RSTRING_PTR(str);
3104     e = s + RSTRING_LEN(str);
3105
3106     while (s < e) {
3107         int c = ASCGET(s, e, &clen);
3108         char *ss;
3109
3110         if (c == -1) {
3111             s += mbclen(s, e, str_enc);
3112             continue;
3113         }
3114         ss = s;
3115         s += clen;
3116
3117         if (c != '\\' || s == e) continue;
3118
3119         if (!val) {
3120             val = rb_str_buf_new(ss-p);
3121         }
3122         rb_enc_str_buf_cat(val, p, ss-p, str_enc);
3123
3124         c = ASCGET(s, e, &clen);
3125         if (c == -1) {
3126             s += mbclen(s, e, str_enc);
3127             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3128             p = s;
3129             continue;
3130         }
3131         s += clen;
3132
3133         p = s;
3134         switch (c) {
3135           case '1': case '2': case '3': case '4':
3136           case '5': case '6': case '7': case '8': case '9':
3137             if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
3138                 no = c - '0';
3139             }
3140             else {
3141                 continue;
3142             }
3143             break;
3144
3145           case 'k':
3146             if (s < e && ASCGET(s, e, &clen) == '<') {
3147                 char *name, *name_end;
3148
3149                 name_end = name = s + clen;
3150                 while (name_end < e) {
3151                     c = ASCGET(name_end, e, &clen);
3152                     if (c == '>') break;
3153                     name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
3154                 }
3155                 if (name_end < e) {
3156                     no = name_to_backref_number(regs, regexp, name, name_end);
3157                     p = s = name_end + clen;
3158                     break;
3159                 }
3160                 else {
3161                     rb_raise(rb_eRuntimeError, "invalid group name reference format");
3162                 }
3163             }
3164
3165             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3166             continue;
3167
3168           case '0':
3169           case '&':
3170             no = 0;
3171             break;
3172
3173           case '`':
3174             rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
3175             continue;
3176
3177           case '\'':
3178             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
3179             continue;
3180
3181           case '+':
3182             no = regs->num_regs-1;
3183             while (BEG(no) == -1 && no > 0) no--;
3184             if (no == 0) continue;
3185             break;
3186
3187           case '\\':
3188             rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
3189             continue;
3190
3191           default:
3192             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3193             continue;
3194         }
3195
3196         if (no >= 0) {
3197             if (no >= regs->num_regs) continue;
3198             if (BEG(no) == -1) continue;
3199             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
3200         }
3201     }
3202
3203     if (!val) return str;
3204     if (p < e) {
3205         rb_enc_str_buf_cat(val, p, e-p, str_enc);
3206     }
3207
3208     return val;
3209 }
3210
3211 static VALUE
3212 kcode_getter(void)
3213 {
3214     rb_warn("variable $KCODE is no longer effective");
3215     return Qnil;
3216 }
3217
3218 static void
3219 kcode_setter(VALUE val, ID id)
3220 {
3221     rb_warn("variable $KCODE is no longer effective; ignored");
3222 }
3223
3224 static VALUE
3225 ignorecase_getter(void)
3226 {
3227     rb_warn("variable $= is no longer effective");
3228     return Qfalse;
3229 }
3230
3231 static void
3232 ignorecase_setter(VALUE val, ID id)
3233 {
3234     rb_warn("variable $= is no longer effective; ignored");
3235 }
3236
3237 static VALUE
3238 match_getter(void)
3239 {
3240     VALUE match = rb_backref_get();
3241
3242     if (NIL_P(match)) return Qnil;
3243     rb_match_busy(match);
3244     return match;
3245 }
3246
3247 static void
3248 match_setter(VALUE val)
3249 {
3250     if (!NIL_P(val)) {
3251         Check_Type(val, T_MATCH);
3252     }
3253     rb_backref_set(val);
3254 }
3255
3256 /*
3257  *  call-seq:
3258  *     Regexp.last_match           => matchdata
3259  *     Regexp.last_match(n)        => str
3260  *
3261  *  The first form returns the <code>MatchData</code> object generated by the
3262  *  last successful pattern match. Equivalent to reading the global variable
3263  *  <code>$~</code>. The second form returns the <i>n</i>th field in this
3264  *  <code>MatchData</code> object.
3265  *  <em>n</em> can be a string or symbol to reference a named capture.
3266  *
3267  *     /c(.)t/ =~ 'cat'        #=> 0
3268  *     Regexp.last_match       #=> #<MatchData "cat" 1:"a">
3269  *     Regexp.last_match(0)    #=> "cat"
3270  *     Regexp.last_match(1)    #=> "a"
3271  *     Regexp.last_match(2)    #=> nil
3272  *
3273  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
3274  *     Regexp.last_match       #=> #<MatchData "var = val" lhs:"var" rhs:"val">
3275  *     Regexp.last_match(:lhs) #=> "var"
3276  *     Regexp.last_match(:rhs) #=> "val"
3277  */
3278
3279 static VALUE
3280 rb_reg_s_last_match(int argc, VALUE *argv)
3281 {
3282     VALUE nth;
3283
3284     if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
3285         VALUE match = rb_backref_get();
3286         int n;
3287         if (NIL_P(match)) return Qnil;
3288         n = match_backref_number(match, nth);
3289         return rb_reg_nth_match(n, match);
3290     }
3291     return match_getter();
3292 }
3293
/*
 * Warning callback installed into Oniguruma by Init_Regexp().  The message
 * is passed as an argument to a fixed "%s" format, so it is never itself
 * interpreted as a format string.
 */
static void
re_warn(const char *s)
{
    rb_warn("%s", s);
}
3299
3300 /*
3301  *  Document-class: Regexp
3302  *
3303  *  A <code>Regexp</code> holds a regular expression, used to match a pattern
3304  *  against strings. Regexps are created using the <code>/.../</code> and
3305  *  <code>%r{...}</code> literals, and by the <code>Regexp::new</code>
3306  *  constructor.
3307  *
3308  */
3309
3310 void
3311 Init_Regexp(void)
3312 {
3313     rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
3314
3315     onigenc_set_default_caseconv_table((UChar*)casetable);
3316     onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
3317     onig_set_warn_func(re_warn);
3318     onig_set_verb_warn_func(re_warn);
3319
3320     rb_define_virtual_variable("$~", match_getter, match_setter);
3321     rb_define_virtual_variable("$&", last_match_getter, 0);
3322     rb_define_virtual_variable("$`", prematch_getter, 0);
3323     rb_define_virtual_variable("$'", postmatch_getter, 0);
3324     rb_define_virtual_variable("$+", last_paren_match_getter, 0);
3325
3326     rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
3327     rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
3328     rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
3329
3330     rb_cRegexp = rb_define_class("Regexp", rb_cObject);
3331     rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
3332     rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
3333     rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
3334     rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
3335     rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
3336     rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
3337     rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
3338
3339     rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
3340     rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
3341     rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
3342     rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
3343     rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
3344     rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
3345     rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
3346     rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
3347     rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
3348     rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
3349     rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
3350     rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
3351     rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
3352     rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
3353     rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */
3354     rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
3355     rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
3356     rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
3357
3358     rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
3359     rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
3360     rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
3361
3362     rb_global_variable(&reg_cache);
3363
3364     rb_cMatch  = rb_define_class("MatchData", rb_cObject);
3365     rb_define_alloc_func(rb_cMatch, match_alloc);
3366     rb_undef_method(CLASS_OF(rb_cMatch), "new");
3367
3368     rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
3369     rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
3370     rb_define_method(rb_cMatch, "names", match_names, 0);
3371     rb_define_method(rb_cMatch, "size", match_size, 0);
3372     rb_define_method(rb_cMatch, "length", match_size, 0);
3373     rb_define_method(rb_cMatch, "offset", match_offset, 1);
3374     rb_define_method(rb_cMatch, "begin", match_begin, 1);
3375     rb_define_method(rb_cMatch, "end", match_end, 1);
3376     rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
3377     rb_define_method(rb_cMatch, "[]", match_aref, -1);
3378     rb_define_method(rb_cMatch, "captures", match_captures, 0);
3379     rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
3380     rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
3381     rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
3382     rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
3383     rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
3384     rb_define_method(rb_cMatch, "string", match_string, 0);
3385 }