re.c

   1 /**********************************************************************
   2
   3   re.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 18:24:49 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9
  10 **********************************************************************/
  11
  12 #include "ruby/ruby.h"
  13 #include "ruby/re.h"
  14 #include "ruby/encoding.h"
  15 #include "ruby/util.h"
  16 #include "regint.h"
  17 #include <ctype.h>
  18
   19 VALUE rb_eRegexpError;  /* exception class handed to rb_raise()/rb_exc_new3() for regexp errors in this file */
   20
      /* Fixed-size buffer type for a regexp engine error message (filled in by make_regexp). */
   21 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
   22
      /* Start/end of capture group `no`.  Both macros expand to a local
         variable that must be named `regs` at the point of use. */
   23 #define BEG(no) regs->beg[no]
   24 #define END(no) regs->end[no]
  25
  26 #if 'a' == 97   /* it's ascii */
  27 static const char casetable[] = {
  28         '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  29         '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  30         '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  31         '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  32         /* ' '     '!'     '"'     '#'     '$'     '%'     '&'     ''' */
  33         '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  34         /* '('     ')'     '*'     '+'     ','     '-'     '.'     '/' */
  35         '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  36         /* '0'     '1'     '2'     '3'     '4'     '5'     '6'     '7' */
  37         '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  38         /* '8'     '9'     ':'     ';'     '<'     '='     '>'     '?' */
  39         '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  40         /* '@'     'A'     'B'     'C'     'D'     'E'     'F'     'G' */
  41         '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  42         /* 'H'     'I'     'J'     'K'     'L'     'M'     'N'     'O' */
  43         '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  44         /* 'P'     'Q'     'R'     'S'     'T'     'U'     'V'     'W' */
  45         '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  46         /* 'X'     'Y'     'Z'     '['     '\'     ']'     '^'     '_' */
  47         '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  48         /* '`'     'a'     'b'     'c'     'd'     'e'     'f'     'g' */
  49         '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  50         /* 'h'     'i'     'j'     'k'     'l'     'm'     'n'     'o' */
  51         '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  52         /* 'p'     'q'     'r'     's'     't'     'u'     'v'     'w' */
  53         '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  54         /* 'x'     'y'     'z'     '{'     '|'     '}'     '~' */
  55         '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  56         '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  57         '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  58         '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  59         '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  60         '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  61         '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  62         '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  63         '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  64         '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  65         '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  66         '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  67         '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  68         '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  69         '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  70         '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  71         '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
  72 };
  73 #else
  74 # error >>> "You lose. You will need a translation table for your character set." <<<
  75 #endif
  76
  77 int
  78 rb_memcicmp(const void *x, const void *y, long len)
  79 {
  80     const unsigned char *p1 = x, *p2 = y;
  81     int tmp;
  82
  83     while (len--) {
  84         if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
  85             return tmp;
  86     }
  87     return 0;
  88 }
  89
  90 #undef rb_memcmp
  91
  92 int
  93 rb_memcmp(const void *p1, const void *p2, long len)
  94 {
  95     return memcmp(p1, p2, len);
  96 }
  97
  98 static inline long
  99 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
 100 {
 101     const unsigned char *x = xs, *xe = xs + m;
 102     const unsigned char *y = ys, *ye = ys + n;
 103 #ifndef VALUE_MAX
 104 # if SIZEOF_VALUE == 8
 105 #  define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
 106 # elif SIZEOF_VALUE == 4
 107 #  define VALUE_MAX 0xFFFFFFFFUL
 108 # endif
 109 #endif
 110     VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
 111
 112     if (m > SIZEOF_VALUE)
 113         rb_bug("!!too long pattern string!!");
 114
 115         /* Prepare hash value */
 116     for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
 117         hx <<= CHAR_BIT;
 118         hy <<= CHAR_BIT;
 119         hx |= *x;
 120         hy |= *y;
 121     }
 122     /* Searching */
 123     while (hx != hy) {
 124         if (y == ye)
 125             return -1;
 126         hy <<= CHAR_BIT;
 127         hy |= *y;
 128         hy &= mask;
 129         y++;
 130     }
 131     return y - ys - m;
 132 }
 133
 134 static inline long
 135 rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
 136 {
 137     const unsigned char *x = xs, *xe = xs + m;
 138     const unsigned char *y = ys;
 139     VALUE i, qstable[256];
 140
 141     /* Preprocessing */
 142     for (i = 0; i < 256; ++i)
 143         qstable[i] = m + 1;
 144     for (; x < xe; ++x)
 145         qstable[*x] = xe - x;
 146     /* Searching */
 147     for (; y + m <= ys + n; y += *(qstable + y[m])) {
 148         if (*xs == *y && memcmp(xs, y, m) == 0)
 149             return y - ys;
 150     }
 151     return -1;
 152 }
 153
 154 static inline unsigned int
 155 rb_memsearch_qs_utf8_hash(const unsigned char *x)
 156 {
 157     register const unsigned int mix = 8353;
 158     register unsigned int h = *x;
 159     if (h < 0xC0) {
 160         return h + 256;
 161     }
 162     else if (h < 0xE0) {
 163         h *= mix;
 164         h += x[1];
 165     }
 166     else if (h < 0xF0) {
 167         h *= mix;
 168         h += x[1];
 169         h *= mix;
 170         h += x[2];
 171     }
 172     else if (h < 0xF5) {
 173         h *= mix;
 174         h += x[1];
 175         h *= mix;
 176         h += x[2];
 177         h *= mix;
 178         h += x[3];
 179     }
 180     else {
 181         return h + 256;
 182     }
 183     return (unsigned char)h;
 184 }
 185
 186 static inline long
 187 rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
 188 {
 189     const unsigned char *x = xs, *xe = xs + m;
 190     const unsigned char *y = ys;
 191     VALUE i, qstable[512];
 192
 193     /* Preprocessing */
 194     for (i = 0; i < 512; ++i) {
 195         qstable[i] = m + 1;
 196     }
 197     for (; x < xe; ++x) {
 198         qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
 199     }
 200     /* Searching */
 201     for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) {
 202         if (*xs == *y && memcmp(xs, y, m) == 0)
 203             return y - ys;
 204     }
 205     return -1;
 206 }
 207
 208 long
 209 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
 210 {
 211     const unsigned char *x = x0, *y = y0;
 212
 213     if (m > n) return -1;
 214     else if (m == n) {
 215         return memcmp(x0, y0, m) == 0 ? 0 : -1;
 216     }
 217     else if (m < 1) {
 218         return 0;
 219     }
 220     else if (m == 1) {
 221         const unsigned char *ys = y, *ye = ys + n;
 222         for (; y < ye; ++y) {
 223             if (*x == *y)
 224                 return y - ys;
 225         }
 226         return -1;
 227     }
 228     else if (m <= SIZEOF_VALUE) {
 229         return rb_memsearch_ss(x0, m, y0, n);
 230     }
 231     else if (enc == rb_utf8_encoding()){
 232         return rb_memsearch_qs_utf8(x0, m, y0, n);
 233     }
 234     else {
 235         return rb_memsearch_qs(x0, m, y0, n);
 236     }
 237 }
 238
      /* Flag bits built on FL_USER4..FL_USER7.  NOTE(review): their uses are
         outside this part of the file -- presumably per-object flags kept on
         Regexp objects; confirm before relying on the names alone. */
  239 #define REG_LITERAL FL_USER5
  240 #define REG_ENCODING_NONE FL_USER6
  241 #define REG_BUSY FL_USER7
  242
  243 #define KCODE_FIXED FL_USER4
  244
      /* The three regexp engine options that have a one-letter flag
         (see char_to_option / option_to_str). */
  245 #define ARG_REG_OPTION_MASK \
  246     (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
      /* Values stored through the `option` out-parameter of
         rb_char_to_option_kcode() for encoding-selecting flag letters. */
  247 #define ARG_ENCODING_FIXED    16
  248 #define ARG_ENCODING_NONE     32
 249
 250 static int
 251 char_to_option(int c)
 252 {
 253     int val;
 254
 255     switch (c) {
 256       case 'i':
 257         val = ONIG_OPTION_IGNORECASE;
 258         break;
 259       case 'x':
 260         val = ONIG_OPTION_EXTEND;
 261         break;
 262       case 'm':
 263         val = ONIG_OPTION_MULTILINE;
 264         break;
 265       default:
 266         val = 0;
 267         break;
 268     }
 269     return val;
 270 }
 271
 272 static char *
 273 option_to_str(char str[4], int options)
 274 {
 275     char *p = str;
 276     if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
 277     if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
 278     if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
 279     *p = 0;
 280     return str;
 281 }
 282
 283 extern int
 284 rb_char_to_option_kcode(int c, int *option, int *kcode)
 285 {
 286     *option = 0;
 287
 288     switch (c) {
 289       case 'n':
 290         *kcode = -1;
 291         return (*option = ARG_ENCODING_NONE);
 292       case 'e':
 293         *kcode = rb_enc_find_index("EUC-JP");
 294         break;
 295       case 's':
 296         *kcode = rb_enc_find_index("Windows-31J");
 297         break;
 298       case 'u':
 299         *kcode = rb_enc_find_index("UTF-8");
 300         break;
 301       default:
 302         *kcode = -1;
 303         return (*option = char_to_option(c));
 304     }
 305     *option = ARG_ENCODING_FIXED;
 306     return 1;
 307 }
 308
 309 static void
 310 rb_reg_check(VALUE re)
 311 {
 312     if (!RREGEXP(re)->ptr || !RREGEXP(re)->str) {
 313         rb_raise(rb_eTypeError, "uninitialized Regexp");
 314     }
 315 }
 316
 317 static void
 318 rb_reg_expr_str(VALUE str, const char *s, long len)
 319 {
 320     rb_encoding *enc = rb_enc_get(str);
 321     const char *p, *pend;
 322     int need_escape = 0;
 323     int c, clen;
 324
 325     p = s; pend = p + len;
 326     while (p<pend) {
 327         c = rb_enc_ascget(p, pend, &clen, enc);
 328         if (c == -1) {
 329             p += mbclen(p, pend, enc);
 330         }
 331         else if (c != '/' && rb_enc_isprint(c, enc)) {
 332             p += clen;
 333         }
 334         else {
 335             need_escape = 1;
 336             break;
 337         }
 338     }
 339     if (!need_escape) {
 340         rb_str_buf_cat(str, s, len);
 341     }
 342     else {
 343         p = s;
 344         while (p<pend) {
 345             c = rb_enc_ascget(p, pend, &clen, enc);
 346             if (c == '\\' && p+clen < pend) {
 347                 int n = clen + mbclen(p+clen, pend, enc);
 348                 rb_str_buf_cat(str, p, n);
 349                 p += n;
 350                 continue;
 351             }
 352             else if (c == '/') {
 353                 char c = '\\';
 354                 rb_str_buf_cat(str, &c, 1);
 355                 rb_str_buf_cat(str, p, clen);
 356             }
 357             else if (c == -1) {
 358                 int l = mbclen(p, pend, enc);
 359                 rb_str_buf_cat(str, p, l);
 360                 p += l;
 361                 continue;
 362             }
 363             else if (rb_enc_isprint(c, enc)) {
 364                 rb_str_buf_cat(str, p, clen);
 365             }
 366             else if (!rb_enc_isspace(c, enc)) {
 367                 char b[8];
 368
 369                 sprintf(b, "\\x%02X", c);
 370                 rb_str_buf_cat(str, b, 4);
 371             }
 372             else {
 373                 rb_str_buf_cat(str, p, clen);
 374             }
 375             p += clen;
 376         }
 377     }
 378 }
 379
 380 static VALUE
 381 rb_reg_desc(const char *s, long len, VALUE re)
 382 {
 383     VALUE str = rb_str_buf_new2("/");
 384
 385     rb_enc_copy(str, re);
 386     rb_reg_expr_str(str, s, len);
 387     rb_str_buf_cat2(str, "/");
 388     if (re) {
 389         char opts[4];
 390         rb_reg_check(re);
 391         if (*option_to_str(opts, RREGEXP(re)->ptr->options))
 392             rb_str_buf_cat2(str, opts);
 393     }
 394     OBJ_INFECT(str, re);
 395     return str;
 396 }
 397
 398
 399 /*
 400  *  call-seq:
 401  *      rxp.source   => str
 402  *
 403  *  Returns the original string of the pattern.
 404  *
 405  *      /ab+c/ix.source #=> "ab+c"
 406  *
 407  *  Note that escape sequences are retained as is.
 408  *
 409  *     /\x20\+/.source  #=> "\\x20\\+"
 410  *
 411  */
 412
 413 static VALUE
 414 rb_reg_source(VALUE re)
 415 {
 416     VALUE str;
 417
 418     rb_reg_check(re);
 419     str = rb_enc_str_new(RREGEXP(re)->str,RREGEXP(re)->len, rb_enc_get(re));
 420     if (OBJ_TAINTED(re)) OBJ_TAINT(str);
 421     return str;
 422 }
 423
 424 /*
 425  * call-seq:
 426  *    rxp.inspect   => string
 427  *
 428  * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly,
  429  * <code>#inspect</code> actually produces a more natural version of
 430  * the string than <code>#to_s</code>.
 431  *
 432  *      /ab+c/ix.inspect        #=> "/ab+c/ix"
 433  *
 434  */
 435
 436 static VALUE
 437 rb_reg_inspect(VALUE re)
 438 {
 439     rb_reg_check(re);
 440     return rb_reg_desc(RREGEXP(re)->str, RREGEXP(re)->len, re);
 441 }
 442
 443
 444 /*
 445  *  call-seq:
 446  *     rxp.to_s   => str
 447  *
 448  *  Returns a string containing the regular expression and its options (using the
  449  *  <code>(?opts:source)</code> notation). This string can be fed back in to
  450  *  <code>Regexp::new</code> to create a regular expression with the same semantics as
 451  *  the original. (However, <code>Regexp#==</code> may not return true when
 452  *  comparing the two, as the source of the regular expression itself may
 453  *  differ, as the example shows).  <code>Regexp#inspect</code> produces a
 454  *  generally more readable version of <i>rxp</i>.
 455  *
 456  *      r1 = /ab+c/ix           #=> /ab+c/ix
 457  *      s1 = r1.to_s            #=> "(?ix-m:ab+c)"
 458  *      r2 = Regexp.new(s1)     #=> /(?ix-m:ab+c)/
 459  *      r1 == r2                #=> false
 460  *      r1.source               #=> "ab+c"
 461  *      r2.source               #=> "(?ix-m:ab+c)"
 462  */
 463
 464 static VALUE
 465 rb_reg_to_s(VALUE re)
 466 {
 467     int options, opt;
 468     const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
 469     long len;
 470     const UChar* ptr;
 471     VALUE str = rb_str_buf_new2("(?");
 472     char optbuf[5];
 473
 474     rb_reg_check(re);
 475
 476     rb_enc_copy(str, re);
 477     options = RREGEXP(re)->ptr->options;
 478     ptr = (UChar*)RREGEXP(re)->str;
 479     len = RREGEXP(re)->len;
 480   again:
 481     if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
 482         int err = 1;
 483         ptr += 2;
 484         if ((len -= 2) > 0) {
 485             do {
 486                 opt = char_to_option((int )*ptr);
 487                 if (opt != 0) {
 488                     options |= opt;
 489                 }
 490                 else {
 491                     break;
 492                 }
 493                 ++ptr;
 494             } while (--len > 0);
 495         }
 496         if (len > 1 && *ptr == '-') {
 497             ++ptr;
 498             --len;
 499             do {
 500                 opt = char_to_option((int )*ptr);
 501                 if (opt != 0) {
 502                     options &= ~opt;
 503                 }
 504                 else {
 505                     break;
 506                 }
 507                 ++ptr;
 508             } while (--len > 0);
 509         }
 510         if (*ptr == ')') {
 511             --len;
 512             ++ptr;
 513             goto again;
 514         }
 515         if (*ptr == ':' && ptr[len-1] == ')') {
 516             int r;
 517             Regexp *rp;
 518             r = onig_alloc_init(&rp, ONIG_OPTION_DEFAULT,
 519                                 ONIGENC_CASE_FOLD_DEFAULT,
 520                                 rb_enc_get(re),
 521                                 OnigDefaultSyntax);
 522             if (r == 0) {
 523                  ++ptr;
 524                  len -= 2;
 525                  err = (onig_compile(rp, ptr, ptr + len, NULL) != 0);
 526             }
 527             onig_free(rp);
 528         }
 529         if (err) {
 530             options = RREGEXP(re)->ptr->options;
 531             ptr = (UChar*)RREGEXP(re)->str;
 532             len = RREGEXP(re)->len;
 533         }
 534     }
 535
 536     if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
 537
 538     if ((options & embeddable) != embeddable) {
 539         optbuf[0] = '-';
 540         option_to_str(optbuf + 1, ~options);
 541         rb_str_buf_cat2(str, optbuf);
 542     }
 543
 544     rb_str_buf_cat2(str, ":");
 545     rb_reg_expr_str(str, (char*)ptr, len);
 546     rb_str_buf_cat2(str, ")");
 547     rb_enc_copy(str, re);
 548
 549     OBJ_INFECT(str, re);
 550     return str;
 551 }
 552
 553 static void
 554 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
 555 {
 556     VALUE desc = rb_reg_desc(s, len, re);
 557
 558     rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
 559 }
 560
 561 static VALUE
 562 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
 563 {
 564     char opts[6];
 565     VALUE desc = rb_str_buf_new2(err);
 566
 567     rb_enc_associate(desc, enc);
 568     rb_str_buf_cat2(desc, ": /");
 569     rb_reg_expr_str(desc, s, len);
 570     opts[0] = '/';
 571     option_to_str(opts + 1, options);
 572     rb_str_buf_cat2(desc, opts);
 573     return rb_exc_new3(rb_eRegexpError, desc);
 574 }
 575
 576 static void
 577 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
 578 {
 579     rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
 580 }
 581
 582 static VALUE
 583 rb_reg_error_desc(VALUE str, int options, const char *err)
 584 {
 585     return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
 586                                  rb_enc_get(str), options, err);
 587 }
 588
 589 static void
 590 rb_reg_raise_str(VALUE str, int options, const char *err)
 591 {
 592     rb_exc_raise(rb_reg_error_desc(str, options, err));
 593 }
 594
 595
 596 /*
 597  *  call-seq:
 598  *     rxp.casefold?   => true or false
 599  *
 600  *  Returns the value of the case-insensitive flag.
 601  *
 602  *      /a/.casefold?           #=> false
 603  *      /a/i.casefold?          #=> true
 604  *      /(?i:a)/.casefold?      #=> false
 605  */
 606
 607 static VALUE
 608 rb_reg_casefold_p(VALUE re)
 609 {
 610     rb_reg_check(re);
 611     if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
 612     return Qfalse;
 613 }
 614
 615
 616 /*
 617  *  call-seq:
 618  *     rxp.options   => fixnum
 619  *
 620  *  Returns the set of bits corresponding to the options used when creating this
  621  *  Regexp (see <code>Regexp::new</code> for details). Note that additional bits
 622  *  may be set in the returned options: these are used internally by the regular
 623  *  expression code. These extra bits are ignored if the options are passed to
 624  *  <code>Regexp::new</code>.
 625  *
 626  *     Regexp::IGNORECASE                  #=> 1
 627  *     Regexp::EXTENDED                    #=> 2
 628  *     Regexp::MULTILINE                   #=> 4
 629  *
 630  *     /cat/.options                       #=> 0
 631  *     /cat/ix.options                     #=> 3
 632  *     Regexp.new('cat', true).options     #=> 1
 633  *     /\xa1\xa2/e.options                 #=> 16
 634  *
 635  *     r = /cat/ix
 636  *     Regexp.new(r.source, r.options)     #=> /cat/ix
 637  */
 638
 639 static VALUE
 640 rb_reg_options_m(VALUE re)
 641 {
 642     int options = rb_reg_options(re);
 643     return INT2NUM(options);
 644 }
 645
 646 static int
 647 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
 648           int back_num, int *back_refs, OnigRegex regex, void *arg)
 649 {
 650     VALUE ary = (VALUE)arg;
 651     rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
 652     return 0;
 653 }
 654
 655 /*
 656  * call-seq:
 657  *    rxp.names   => [name1, name2, ...]
 658  *
 659  * Returns a list of names of captures as an array of strings.
 660  *
 661  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.names
 662  *     #=> ["foo", "bar", "baz"]
 663  *
 664  *     /(?<foo>.)(?<foo>.)/.names
 665  *     #=> ["foo"]
 666  *
 667  *     /(.)(.)/.names
 668  *     #=> []
 669  */
 670
 671 static VALUE
 672 rb_reg_names(VALUE re)
 673 {
 674     VALUE ary = rb_ary_new();
 675     onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
 676     return ary;
 677 }
 678
 679 static int
 680 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
 681           int back_num, int *back_refs, OnigRegex regex, void *arg)
 682 {
 683     VALUE hash = (VALUE)arg;
 684     VALUE ary = rb_ary_new2(back_num);
 685     int i;
 686
 687     for(i = 0; i < back_num; i++)
 688         rb_ary_store(ary, i, INT2NUM(back_refs[i]));
 689
 690     rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
 691
 692     return 0;
 693 }
 694
 695 /*
 696  * call-seq:
 697  *    rxp.named_captures  => hash
 698  *
 699  * Returns a hash representing information about named captures of <i>rxp</i>.
 700  *
 701  * A key of the hash is a name of the named captures.
  702  * A value of the hash is an array which is a list of indexes of the corresponding
 703  * named captures.
 704  *
 705  *    /(?<foo>.)(?<bar>.)/.named_captures
 706  *    #=> {"foo"=>[1], "bar"=>[2]}
 707  *
 708  *    /(?<foo>.)(?<foo>.)/.named_captures
 709  *    #=> {"foo"=>[1, 2]}
 710  *
 711  * If there are no named captures, an empty hash is returned.
 712  *
 713  *    /(.)(.)/.named_captures
 714  *    #=> {}
 715  */
 716
 717 static VALUE
 718 rb_reg_named_captures(VALUE re)
 719 {
 720     VALUE hash = rb_hash_new();
 721     onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
 722     return hash;
 723 }
 724
 725 static Regexp*
 726 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err)
 727 {
 728     Regexp *rp;
 729     int r;
 730     OnigErrorInfo einfo;
 731
 732     /* Handle escaped characters first. */
 733
 734     /* Build a copy of the string (in dest) with the
 735        escaped characters translated,  and generate the regex
 736        from that.
 737     */
 738
 739     r = onig_alloc_init(&rp, flags, ONIGENC_CASE_FOLD_DEFAULT,
 740                         enc, OnigDefaultSyntax);
 741     if (r) {
 742         onig_error_code_to_str((UChar*)err, r);
 743         return 0;
 744     }
 745
 746     r = onig_compile(rp, (UChar*)s, (UChar*)(s + len), &einfo);
 747
 748     if (r != 0) {
 749         onig_free(rp);
 750         (void )onig_error_code_to_str((UChar*)err, r, &einfo);
 751         return 0;
 752     }
 753     return rp;
 754 }
 755
 756
 757 /*
 758  *  Document-class: MatchData
 759  *
 760  *  <code>MatchData</code> is the type of the special variable <code>$~</code>,
 761  *  and is the type of the object returned by <code>Regexp#match</code> and
 762  *  <code>Regexp.last_match</code>. It encapsulates all the results of a pattern
 763  *  match, results normally accessed through the special variables
 764  *  <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>,
 765  *  <code>$2</code>, and so on.
 766  *
 767  */
 768
  769 VALUE rb_cMatch;  /* class object for the MatchData class documented in the comment above */
 770
 771 static VALUE
 772 match_alloc(VALUE klass)
 773 {
 774     NEWOBJ(match, struct RMatch);
 775     OBJSETUP(match, klass, T_MATCH);
 776
 777     match->str = 0;
 778     match->rmatch = 0;
 779     match->regexp = 0;
 780     match->rmatch = ALLOC(struct rmatch);
 781     MEMZERO(match->rmatch, struct rmatch, 1);
 782
 783     return (VALUE)match;
 784 }
 785
 786 typedef struct {
 787     int byte_pos;
 788     int char_pos;
 789 } pair_t;
 790
 791 static int
 792 pair_byte_cmp(const void *pair1, const void *pair2)
 793 {
 794     return ((pair_t*)pair1)->byte_pos - ((pair_t*)pair2)->byte_pos;
 795 }
 796
 797 static void
 798 update_char_offset(VALUE match)
 799 {
 800     struct rmatch *rm = RMATCH(match)->rmatch;
 801     struct re_registers *regs;
 802     int num_regs;
 803     int i, num_pos, c;
 804     char *s, *p, *q, *e;
 805     rb_encoding *enc;
 806     pair_t *pairs;
 807
 808     if (rm->char_offset_updated)
 809         return;
 810
 811     regs = &rm->regs;
 812     num_regs = rm->regs.num_regs;
 813
 814     if (rm->char_offset_num_allocated < num_regs) {
 815         REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
 816         rm->char_offset_num_allocated = num_regs;
 817     }
 818
 819     enc = rb_enc_get(RMATCH(match)->str);
 820     if (rb_enc_mbmaxlen(enc) == 1) {
 821         for (i = 0; i < num_regs; i++) {
 822             rm->char_offset[i].beg = BEG(i);
 823             rm->char_offset[i].end = END(i);
 824         }
 825         rm->char_offset_updated = 1;
 826         return;
 827     }
 828
 829     pairs = ALLOCA_N(pair_t, num_regs*2);
 830     num_pos = 0;
 831     for (i = 0; i < num_regs; i++) {
 832         if (BEG(i) < 0)
 833             continue;
 834         pairs[num_pos++].byte_pos = BEG(i);
 835         pairs[num_pos++].byte_pos = END(i);
 836     }
 837     qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
 838
 839     s = p = RSTRING_PTR(RMATCH(match)->str);
 840     e = s + RSTRING_LEN(RMATCH(match)->str);
 841     c = 0;
 842     for (i = 0; i < num_pos; i++) {
 843         q = s + pairs[i].byte_pos;
 844         c += rb_enc_strlen(p, q, enc);
 845         pairs[i].char_pos = c;
 846         p = q;
 847     }
 848
 849     for (i = 0; i < num_regs; i++) {
 850         pair_t key, *found;
 851         if (BEG(i) < 0) {
 852             rm->char_offset[i].beg = -1;
 853             rm->char_offset[i].end = -1;
 854             continue;
 855         }
 856
 857         key.byte_pos = BEG(i);
 858         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
 859         rm->char_offset[i].beg = found->char_pos;
 860
 861         key.byte_pos = END(i);
 862         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
 863         rm->char_offset[i].end = found->char_pos;
 864     }
 865
 866     rm->char_offset_updated = 1;
 867 }
 868
 869 /* :nodoc: */
 870 static VALUE
 871 match_init_copy(VALUE obj, VALUE orig)
 872 {
 873     struct rmatch *rm;
 874
 875     if (obj == orig) return obj;
 876
 877     if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
 878         rb_raise(rb_eTypeError, "wrong argument class");
 879     }
 880     RMATCH(obj)->str = RMATCH(orig)->str;
 881     RMATCH(obj)->regexp = RMATCH(orig)->regexp;
 882
 883     rm = RMATCH(obj)->rmatch;
 884     onig_region_free(&rm->regs, 0);
 885     rm->regs.allocated = 0;
 886
 887     onig_region_copy(&rm->regs, RMATCH_REGS(orig));
 888
 889     if (!RMATCH(orig)->rmatch->char_offset_updated) {
 890         rm->char_offset_updated = 0;
 891     }
 892     else {
 893         if (rm->char_offset_num_allocated < rm->regs.num_regs) {
 894             REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
 895             rm->char_offset_num_allocated = rm->regs.num_regs;
 896         }
 897         MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
 898                struct rmatch_offset, rm->regs.num_regs);
 899         rm->char_offset_updated = 1;
 900     }
 901
 902     return obj;
 903 }
 904
 905
 906 /*
 907  * call-seq:
 908  *    mtch.regexp   => regexp
 909  *
 910  * Returns the regexp.
 911  *
 912  *     m = /a.*b/.match("abc")
 913  *     m.regexp #=> /a.*b/
 914  */
 915
 916 static VALUE
 917 match_regexp(VALUE match)
 918 {
 919     return RMATCH(match)->regexp;
 920 }
 921
 922 /*
 923  * call-seq:
 924  *    mtch.names   => [name1, name2, ...]
 925  *
 926  * Returns a list of names of captures as an array of strings.
  927  * It is the same as mtch.regexp.names.
 928  *
 929  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
 930  *     #=> ["foo", "bar", "baz"]
 931  *
 932  *     m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
 933  *     m.names                          #=> ["x", "y"]
 934  */
 935
 936 static VALUE
 937 match_names(VALUE match)
 938 {
 939     return rb_reg_names(RMATCH(match)->regexp);
 940 }
 941
 942 /*
 943  *  call-seq:
 944  *     mtch.length   => integer
 945  *     mtch.size     => integer
 946  *
 947  *  Returns the number of elements in the match array.
 948  *
 949  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
 950  *     m.length   #=> 5
 951  *     m.size     #=> 5
 952  */
 953
 954 static VALUE
 955 match_size(VALUE match)
 956 {
 957     return INT2FIX(RMATCH_REGS(match)->num_regs);
 958 }
 959
 960 static int
 961 match_backref_number(VALUE match, VALUE backref)
 962 {
 963     const char *name;
 964     int num;
 965
 966     struct re_registers *regs = RMATCH_REGS(match);
 967     VALUE regexp = RMATCH(match)->regexp;
 968
 969     switch(TYPE(backref)) {
 970       default:
 971         return NUM2INT(backref);
 972
 973       case T_SYMBOL:
 974         name = rb_id2name(SYM2ID(backref));
 975         break;
 976
 977       case T_STRING:
 978         name = StringValueCStr(backref);
 979         break;
 980     }
 981
 982     num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
 983               (const unsigned char*)name,
 984               (const unsigned char*)name + strlen(name),
 985               regs);
 986
 987     if (num < 1) {
 988         rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
 989     }
 990
 991     return num;
 992 }
 993
 994
 995 /*
 996  *  call-seq:
 997  *     mtch.offset(n)   => array
 998  *
 999  *  Returns a two-element array containing the beginning and ending offsets of
1000  *  the <em>n</em>th match.
1001  *  <em>n</em> can be a string or symbol to reference a named capture.
1002  *
1003  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1004  *     m.offset(0)      #=> [1, 7]
1005  *     m.offset(4)      #=> [6, 7]
1006  *
1007  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1008  *     p m.offset(:foo) #=> [0, 1]
1009  *     p m.offset(:bar) #=> [2, 3]
1010  *
1011  */
1012
1013 static VALUE
1014 match_offset(VALUE match, VALUE n)
1015 {
1016     struct re_registers *regs;
1017     VALUE first, last;
1018     int idx;
1019
1020     /* n may be an integer, a symbol or a string naming a group. */
1021     idx = match_backref_number(match, n);
1022     regs = RMATCH_REGS(match);
1023
1024     if (!(0 <= idx && idx < regs->num_regs)) {
1025         rb_raise(rb_eIndexError, "index %d out of matches", idx);
1026     }
1027
1028     /* A negative start means no position was recorded for this group. */
1029     if (BEG(idx) < 0) {
1030         return rb_assoc_new(Qnil, Qnil);
1031     }
1032
1033     /* char_offset[] is only read after update_char_offset() has run. */
1034     update_char_offset(match);
1035     first = INT2FIX(RMATCH(match)->rmatch->char_offset[idx].beg);
1036     last = INT2FIX(RMATCH(match)->rmatch->char_offset[idx].end);
1037     return rb_assoc_new(first, last);
1038 }
1029
1030
1031 /*
1032  *  call-seq:
1033  *     mtch.begin(n)   => integer
1034  *
1035  *  Returns the offset of the start of the <em>n</em>th element of the match
1036  *  array in the string.
1037  *  <em>n</em> can be a string or symbol to reference a named capture.
1038  *
1039  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1040  *     m.begin(0)       #=> 1
1041  *     m.begin(2)       #=> 2
1042  *
1043  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1044  *     p m.begin(:foo)  #=> 0
1045  *     p m.begin(:bar)  #=> 2
1046  */
1047
1048 static VALUE
1049 match_begin(VALUE match, VALUE n)
1050 {
1051     struct re_registers *regs;
1052     int idx;
1053
1054     /* n may be an integer, a symbol or a string naming a group. */
1055     idx = match_backref_number(match, n);
1056     regs = RMATCH_REGS(match);
1057
1058     if (!(0 <= idx && idx < regs->num_regs)) {
1059         rb_raise(rb_eIndexError, "index %d out of matches", idx);
1060     }
1061
1062     /* A negative start means no position was recorded for this group. */
1063     if (BEG(idx) < 0) {
1064         return Qnil;
1065     }
1066
1067     /* char_offset[] is only read after update_char_offset() has run. */
1068     update_char_offset(match);
1069     return INT2FIX(RMATCH(match)->rmatch->char_offset[idx].beg);
1070 }
1063
1064
1065 /*
1066  *  call-seq:
1067  *     mtch.end(n)   => integer
1068  *
1069  *  Returns the offset of the character immediately following the end of the
1070  *  <em>n</em>th element of the match array in the string.
1071  *  <em>n</em> can be a string or symbol to reference a named capture.
1072  *
1073  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1074  *     m.end(0)         #=> 7
1075  *     m.end(2)         #=> 3
1076  *
1077  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1078  *     p m.end(:foo)    #=> 1
1079  *     p m.end(:bar)    #=> 3
1080  */
1081
1082 static VALUE
1083 match_end(VALUE match, VALUE n)
1084 {
1085     struct re_registers *regs;
1086     int idx;
1087
1088     /* n may be an integer, a symbol or a string naming a group. */
1089     idx = match_backref_number(match, n);
1090     regs = RMATCH_REGS(match);
1091
1092     if (!(0 <= idx && idx < regs->num_regs)) {
1093         rb_raise(rb_eIndexError, "index %d out of matches", idx);
1094     }
1095
1096     /* A negative start means no position was recorded for this group. */
1097     if (BEG(idx) < 0) {
1098         return Qnil;
1099     }
1100
1101     /* char_offset[] is only read after update_char_offset() has run. */
1102     update_char_offset(match);
1103     return INT2FIX(RMATCH(match)->rmatch->char_offset[idx].end);
1104 }
1097
1098 #define MATCH_BUSY FL_USER2  /* flag bit: this MatchData must not be reused */
1099
1100 void
1101 rb_match_busy(VALUE match)
1102 {
         /* Set MATCH_BUSY on +match+.  rb_reg_search() tests this flag and
          * allocates a new match object instead of overwriting a busy one. */
1103     FL_SET(match, MATCH_BUSY);
1104 }
1105
1106 /*
1107  *  call-seq:
1108  *     rxp.fixed_encoding?   => true or false
1109  *
1110  *  Returns false if rxp is applicable to
1111  *  a string with any ASCII compatible encoding.
1112  *  Returns true otherwise.
1113  *
1114  *      r = /a/
1115  *      r.fixed_encoding?                               #=> false
1116  *      r =~ "\u{6666} a"                               #=> 2
1117  *      r =~ "\xa1\xa2 a".force_encoding("euc-jp")      #=> 2
1118  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
1119  *
1120  *      r = /a/u
1121  *      r.fixed_encoding?                               #=> true
1122  *      r.encoding                                      #=> #<Encoding:UTF-8>
1123  *      r =~ "\u{6666} a"                               #=> 2
1124  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
1125  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
1126  *
1127  *      r = /\u{6666}/
1128  *      r.fixed_encoding?                               #=> true
1129  *      r.encoding                                      #=> #<Encoding:UTF-8>
1130  *      r =~ "\u{6666} a"                               #=> 0
1131  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
1132  *      r =~ "abc".force_encoding("euc-jp")             #=> nil
1133  */
1134
1135 static VALUE
1136 rb_reg_fixed_encoding_p(VALUE re)
1137 {
1138     /* Report the KCODE_FIXED flag of the regexp as a Ruby boolean. */
1139     return FL_TEST(re, KCODE_FIXED) ? Qtrue : Qfalse;
1140 }
1143
1144 static VALUE
1145 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
1146         rb_encoding **fixed_enc, onig_errmsg_buffer err);
     /* Forward declaration: rb_reg_prepare_re() below calls this function
      * before its definition appears later in the file. */
1147
1148
1149 /*
1150  * Choose the encoding used to match +re+ against +str+.
1151  *
1152  * Raises ArgumentError when +str+ has a broken code range, or when a
1153  * fixed-encoding regexp (or a string that is not ASCII compatible)
1154  * meets a non-7bit string whose encoding differs from the regexp's.
1155  * With +warn+ set, a REG_ENCODING_NONE regexp matched against a non-7bit
1156  * string that is not ASCII-8BIT produces a warning.
1157  *
1158  * Returns the string's encoding when the regexp can follow it, and the
1159  * encoding of the compiled pattern otherwise.
1160  */
1161 static rb_encoding*
1162 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
1163 {
1164     rb_encoding *str_enc;
1165
1166     if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
1167         rb_raise(rb_eArgError,
1168             "broken %s string",
1169             rb_enc_name(rb_enc_get(str)));
1170     }
1171
1172     rb_reg_check(re);
1173
1174     /* Flexible case: the regexp simply adopts the string's encoding. */
1175     if (!rb_reg_fixed_encoding_p(re) && rb_enc_str_asciicompat_p(str)) {
1176         str_enc = rb_enc_get(str);
1177         if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
1178             str_enc != rb_ascii8bit_encoding() &&
1179             rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1180             rb_warn("regexp match /.../n against to %s string",
1181                     rb_enc_name(str_enc));
1182         }
1183         return str_enc;
1184     }
1185
1186     /* Fixed case: encodings must agree unless the string is pure 7bit. */
1187     if (ENCODING_GET(re) != rb_enc_get_index(str) &&
1188         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1189         rb_raise(rb_eArgError,
1190             "incompatible encoding regexp match (%s regexp with %s string)",
1191             rb_enc_name(rb_enc_from_index(ENCODING_GET(re))),
1192             rb_enc_name(rb_enc_get(str)));
1193     }
1194     return RREGEXP(re)->ptr->enc;
1195 }
1183
1184 /*
1185  * Return a compiled pattern for +re+ that uses encoding +enc+.
1186  *
1187  * The pattern already attached to +re+ is returned when its encoding is
1188  * +enc+.  Otherwise the regexp source is preprocessed and compiled again
1189  * with onig_new(), and the new pattern is returned; RREGEXP(re)->ptr is
1190  * not modified here.  Raises if preprocessing or compilation fails.
1191  */
1192 static regex_t *
1193 rb_reg_prepare_re(VALUE re, rb_encoding *enc)
1194 {
1195     regex_t *reg = RREGEXP(re)->ptr;
1196     onig_errmsg_buffer err = "";
1197     OnigErrorInfo einfo;
1198     rb_encoding *fixed_enc = 0;
1199     const char *src;
1200     UChar *ubeg, *uend;
1201     VALUE unescaped;
1202     int status;
1203
1204     if (reg->enc == enc) {
1205         return reg;
1206     }
1207
1208     rb_reg_check(re);
1209     reg = RREGEXP(re)->ptr;
1210     src = RREGEXP(re)->str;
1211
1212     unescaped = rb_reg_preprocess(src, src + RREGEXP(re)->len, enc,
1213                                   &fixed_enc, err);
1214     if (unescaped == Qnil) {
1215         rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
1216     }
1217
1218     ubeg = (UChar *)RSTRING_PTR(unescaped);
1219     uend = ubeg + RSTRING_LEN(unescaped);
1220     /* The options of the current pattern are carried over to the new one. */
1221     status = onig_new(&reg, ubeg, uend, reg->options, enc,
1222                       OnigDefaultSyntax, &einfo);
1223     if (status) {
1224         onig_error_code_to_str((UChar*)err, status, &einfo);
1225         rb_reg_raise(src, RREGEXP(re)->len, err, re);
1226     }
1227
1228     /* Keep the preprocessed string referenced until compilation is over. */
1229     RB_GC_GUARD(unescaped);
1230     return reg;
1231 }
1221
1222 /*
1223  * Move byte offset +pos+ in +str+ onto a character boundary of the
1224  * encoding chosen for matching +re+ against +str+.
1225  *
1226  * +pos+ is returned unchanged when it is not strictly inside the string
1227  * or when the encoding's maximum character length is 1.  Otherwise it is
1228  * adjusted to the right when the remaining search range is positive and
1229  * to the left when it is not (the range is -pos for a reverse search).
1230  */
1231 int
1232 rb_reg_adjust_startpos(VALUE re, VALUE str, int pos, int reverse)
1233 {
1234     rb_encoding *enc = rb_reg_prepare_enc(re, str, 0);
1235     int range = reverse ? -pos : RSTRING_LEN(str) - pos;
1236     UChar *base, *head;
1237
1238     if (pos <= 0 || ONIGENC_MBC_MAXLEN(enc) == 1 || pos >= RSTRING_LEN(str)) {
1239         return pos;
1240     }
1241
1242     base = (UChar*)RSTRING_PTR(str);
1243     if (range > 0) {
1244         head = onigenc_get_right_adjust_char_head(enc, base, base + pos);
1245     }
1246     else {
1247         head = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, base, base + pos);
1248     }
1249     return head - base;
1250 }
1252
1253 /*
1254  * Search +str+ for +re+ starting at byte offset +pos+; the search range
1255  * runs to the end of the string, or back to its start when +reverse+ is
1256  * non-zero.
1257  *
1258  * Returns the value reported by onig_search() on success.  Returns -1 and
1259  * clears the backref when +pos+ lies outside the string, and returns
1260  * ONIG_MISMATCH and clears the backref when nothing matches.  Any other
1261  * negative result is raised through rb_reg_raise().
1262  *
1263  * On success the backref is set to a MatchData holding a copy of the
1264  * registers, the result of rb_str_new4(str), and +re+.
1265  */
1266 int
1267 rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
1268 {
1269     int result;
1270     VALUE match;
1271     struct re_registers regs;
1272     char *range = RSTRING_PTR(str);
1273     regex_t *reg0 = RREGEXP(re)->ptr, *reg;
1274     int busy = FL_TEST(re, REG_BUSY);
1275
1276     if (pos > RSTRING_LEN(str) || pos < 0) {
1277         rb_backref_set(Qnil);
1278         return -1;
1279     }
1280
1281     reg = rb_reg_prepare_re(re, rb_reg_prepare_enc(re, str, 1));
1282
1283     FL_SET(re, REG_BUSY);
1284     if (!reverse) {
1285         range += RSTRING_LEN(str);
1286     }
1287     MEMZERO(&regs, struct re_registers, 1);
1288     result = onig_search(reg,
1289                          (UChar*)(RSTRING_PTR(str)),
1290                          ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
1291                          ((UChar*)(RSTRING_PTR(str)) + pos),
1292                          ((UChar*)range),
1293                          &regs, ONIG_OPTION_NONE);
1294
1295     /* rb_reg_prepare_re() returned a pattern other than the one attached
1296      * to the regexp.  Install it unless the regexp was already busy on
1297      * entry, in which case the new pattern is discarded instead. */
1298     if (RREGEXP(re)->ptr != reg) {
1299         if (busy) {
1300             onig_free(reg);
1301         }
1302         else {
1303             onig_free(reg0);
1304             RREGEXP(re)->ptr = reg;
1305         }
1306     }
1307     if (!busy) FL_UNSET(re, REG_BUSY);
1308     if (result < 0) {
1309         /* The register arrays filled in by onig_search() belong to this
1310          * function; release them before leaving on any failure path. */
1311         onig_region_free(&regs, 0);
1312         if (result == ONIG_MISMATCH) {
1313             rb_backref_set(Qnil);
1314             return result;
1315         }
1316         else {
1317             onig_errmsg_buffer err = "";
1318             onig_error_code_to_str((UChar*)err, result);
1319             rb_reg_raise(RREGEXP(re)->str, RREGEXP(re)->len, err, 0);
1320         }
1321     }
1322
1323     /* Reuse the current backref unless there is none or it is busy. */
1324     match = rb_backref_get();
1325     if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) {
1326         match = match_alloc(rb_cMatch);
1327     }
1328     else {
1329         if (rb_safe_level() >= 3)
1330             OBJ_TAINT(match);
1331         else
1332             FL_UNSET(match, FL_TAINT);
1333     }
1334
1335     onig_region_copy(RMATCH_REGS(match), &regs);
1336     /* The match holds its own copy now; free the temporary arrays so they
1337      * do not leak on every successful search. */
1338     onig_region_free(&regs, 0);
1339     RMATCH(match)->str = rb_str_new4(str);
1340     RMATCH(match)->regexp = re;
1341     RMATCH(match)->rmatch->char_offset_updated = 0;
1342     rb_backref_set(match);
1343
1344     OBJ_INFECT(match, re);
1345     OBJ_INFECT(match, str);
1346
1347     return result;
1348 }
1326
1327 /*
1328  * Tell whether register +nth+ of +match+ has a recorded start.
1329  *
1330  * A negative +nth+ counts back from the number of registers.  Returns
1331  * Qnil when +match+ is nil or the index falls outside the registers
1332  * (index 0 reached through a negative +nth+ included), Qfalse when the
1333  * register's start is -1, and Qtrue otherwise.
1334  */
1335 VALUE
1336 rb_reg_nth_defined(int nth, VALUE match)
1337 {
1338     struct re_registers *regs;
1339
1340     if (NIL_P(match)) {
1341         return Qnil;
1342     }
1343     regs = RMATCH_REGS(match);
1344     if (nth >= regs->num_regs) {
1345         return Qnil;
1346     }
1347     if (nth < 0) {
1348         nth += regs->num_regs;
1349         if (nth <= 0) {
1350             return Qnil;
1351         }
1352     }
1353     return (BEG(nth) == -1) ? Qfalse : Qtrue;
1354 }
1343
1344 /*
1345  * Return the text of register +nth+ of +match+, taken from the matched
1346  * string with rb_str_subseq().
1347  *
1348  * A negative +nth+ counts back from the number of registers.  Qnil is
1349  * returned when +match+ is nil, when the index falls outside the
1350  * registers (index 0 reached through a negative +nth+ included), or when
1351  * the register's start is -1.  The result is infected by +match+.
1352  */
1353 VALUE
1354 rb_reg_nth_match(int nth, VALUE match)
1355 {
1356     struct re_registers *regs;
1357     long from, span;
1358     VALUE sub;
1359
1360     if (NIL_P(match)) {
1361         return Qnil;
1362     }
1363     regs = RMATCH_REGS(match);
1364     if (nth >= regs->num_regs) {
1365         return Qnil;
1366     }
1367     if (nth < 0) {
1368         nth += regs->num_regs;
1369         if (nth <= 0) {
1370             return Qnil;
1371         }
1372     }
1373
1374     from = BEG(nth);
1375     if (from == -1) {
1376         return Qnil;
1377     }
1378     span = END(nth) - from;
1379     sub = rb_str_subseq(RMATCH(match)->str, from, span);
1380     OBJ_INFECT(sub, match);
1381     return sub;
1382 }
1368
1369 /* Return the text of register 0 (the whole match) of +match+, or Qnil. */
1370 VALUE
1371 rb_reg_last_match(VALUE match)
1372 {
1373     VALUE whole = rb_reg_nth_match(0, match);
1374
1375     return whole;
1376 }
1374
1375
1376 /*
1377  *  call-seq:
1378  *     mtch.pre_match   => str
1379  *
1380  *  Returns the portion of the original string before the current match.
1381  *  Equivalent to the special variable <code>$`</code>.
1382  *
1383  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1384  *     m.pre_match   #=> "T"
1385  */
1386
1387 VALUE
1388 rb_reg_match_pre(VALUE match)
1389 {
1390     struct re_registers *regs;
1391     VALUE head;
1392
1393     if (NIL_P(match)) {
1394         return Qnil;
1395     }
1396     regs = RMATCH_REGS(match);
1397     if (BEG(0) == -1) {
1398         return Qnil;
1399     }
1400
1401     /* Everything from offset 0 of the matched string up to the match start. */
1402     head = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
1403     if (OBJ_TAINTED(match)) {
1404         OBJ_TAINT(head);
1405     }
1406     return head;
1407 }
1400
1401
1402 /*
1403  *  call-seq:
1404  *     mtch.post_match   => str
1405  *
1406  *  Returns the portion of the original string after the current match.
1407  *  Equivalent to the special variable <code>$'</code>.
1408  *
1409  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1410  *     m.post_match   #=> ": The Movie"
1411  */
1412
1413 VALUE
1414 rb_reg_match_post(VALUE match)
1415 {
1416     struct re_registers *regs;
1417     VALUE subject, tail;
1418     long after;
1419
1420     if (NIL_P(match)) {
1421         return Qnil;
1422     }
1423     regs = RMATCH_REGS(match);
1424     if (BEG(0) == -1) {
1425         return Qnil;
1426     }
1427
1428     /* Everything from the end of the match to the end of the string. */
1429     subject = RMATCH(match)->str;
1430     after = END(0);
1431     tail = rb_str_subseq(subject, after, RSTRING_LEN(subject) - after);
1432     if (OBJ_TAINTED(match)) {
1433         OBJ_TAINT(tail);
1434     }
1435     return tail;
1436 }
1429
1430 /*
1431  * Return the text of the highest-numbered register above 0 that has a
1432  * recorded start.  Returns Qnil when +match+ is nil, when register 0 has
1433  * no start, or when no register above 0 has one.
1434  */
1435 VALUE
1436 rb_reg_match_last(VALUE match)
1437 {
1438     struct re_registers *regs;
1439     int last;
1440
1441     if (NIL_P(match)) {
1442         return Qnil;
1443     }
1444     regs = RMATCH_REGS(match);
1445     if (BEG(0) == -1) {
1446         return Qnil;
1447     }
1448
1449     /* Walk down from the top register until one with a start is found. */
1450     last = regs->num_regs - 1;
1451     while (BEG(last) == -1 && last > 0) {
1452         last--;
1453     }
1454     if (last == 0) {
1455         return Qnil;
1456     }
1457     return rb_reg_nth_match(last, match);
1458 }
1445
1446 /* Getter: rb_reg_last_match() applied to the current backref. */
1447 static VALUE
1448 last_match_getter(void)
1449 {
1450     VALUE backref = rb_backref_get();
1451
1452     return rb_reg_last_match(backref);
1453 }
1451
1452 /* Getter: rb_reg_match_pre() applied to the current backref. */
1453 static VALUE
1454 prematch_getter(void)
1455 {
1456     VALUE backref = rb_backref_get();
1457
1458     return rb_reg_match_pre(backref);
1459 }
1457
1458 /* Getter: rb_reg_match_post() applied to the current backref. */
1459 static VALUE
1460 postmatch_getter(void)
1461 {
1462     VALUE backref = rb_backref_get();
1463
1464     return rb_reg_match_post(backref);
1465 }
1463
1464 /* Getter: rb_reg_match_last() applied to the current backref. */
1465 static VALUE
1466 last_paren_match_getter(void)
1467 {
1468     VALUE backref = rb_backref_get();
1469
1470     return rb_reg_match_last(backref);
1471 }
1469
1470 /*
1471  * Build an array with one entry per register of +match+, beginning at
1472  * register +start+.  Registers whose start is -1 become nil; the others
1473  * become substrings of the matched string, tainted when +match+ is
1474  * tainted.
1475  */
1476 static VALUE
1477 match_array(VALUE match, int start)
1478 {
1479     struct re_registers *regs = RMATCH_REGS(match);
1480     VALUE result = rb_ary_new2(regs->num_regs);
1481     VALUE subject = RMATCH(match)->str;
1482     int tainted = OBJ_TAINTED(match);
1483     int i;
1484
1485     for (i = start; i < regs->num_regs; i++) {
1486         VALUE piece;
1487
1488         if (BEG(i) == -1) {
1489             rb_ary_push(result, Qnil);
1490             continue;
1491         }
1492         piece = rb_str_subseq(subject, BEG(i), END(i) - BEG(i));
1493         if (tainted) {
1494             OBJ_TAINT(piece);
1495         }
1496         rb_ary_push(result, piece);
1497     }
1498     return result;
1499 }
1491
1492
1493 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the
1494    second example to prevent the '*' followed by a '/' from ending the
1495    comment. */
1496
1497 /*
1498  *  call-seq:
1499  *     mtch.to_a   => anArray
1500  *
1501  *  Returns the array of matches.
1502  *
1503  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1504  *     m.to_a   #=> ["HX1138", "H", "X", "113", "8"]
1505  *
1506  *  Because <code>to_a</code> is called when expanding
1507  *  <code>*</code><em>variable</em>, there's a useful assignment
1508  *  shortcut for extracting matched fields. This is slightly slower than
1509  *  accessing the fields directly (as an intermediate array is
1510  *  generated).
1511  *
1512  *     all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138."))
1513  *     all   #=> "HX1138"
1514  *     f1    #=> "H"
1515  *     f2    #=> "X"
1516  *     f3    #=> "113"
1517  */
1518
1519 static VALUE
1520 match_to_a(VALUE match)
1521 {
1522     /* Start at register 0 so the whole match is included. */
1523     const int first_register = 0;
1524
1525     return match_array(match, first_register);
1526 }
1524
1525
1526 /*
1527  *  call-seq:
1528  *     mtch.captures   => array
1529  *
1530  *  Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
1531  *
1532  *     f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
1533  *     f1    #=> "H"
1534  *     f2    #=> "X"
1535  *     f3    #=> "113"
1536  *     f4    #=> "8"
1537  */
1538 static VALUE
1539 match_captures(VALUE match)
1540 {
1541     /* Start at register 1 so the whole match (register 0) is left out. */
1542     const int first_group = 1;
1543
1544     return match_array(match, first_group);
1545 }
1543
1544 /*
1545  * Look up the group whose name spans name..name_end in +regexp+ and
1546  * return the number reported by onig_name_to_backref_number().
1547  * Raises IndexError when that number is below 1.
1548  */
1549 static int
1550 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
1551 {
1552     const unsigned char *from = (const unsigned char *)name;
1553     const unsigned char *to = (const unsigned char *)name_end;
1554     int num = onig_name_to_backref_number(RREGEXP(regexp)->ptr, from, to, regs);
1555
1556     if (num < 1) {
1557         /* The name is delimited by name_end, so copy it for the message. */
1558         VALUE s = rb_str_new(name, (long )(name_end - name));
1559         rb_raise(rb_eIndexError, "undefined group name reference: %s",
1560                                  StringValuePtr(s));
1561     }
1562     return num;
1563 }
1560
1561 /*
1562  *  call-seq:
1563  *     mtch[i]               => str or nil
1564  *     mtch[start, length]   => array
1565  *     mtch[range]           => array
1566  *     mtch[name]            => str or nil
1567  *
1568  *  Match Reference---<code>MatchData</code> acts as an array, and may be
1569  *  accessed using the normal array indexing techniques.  <i>mtch</i>[0] is
1570  *  equivalent to the special variable <code>$&</code>, and returns the entire
1571  *  matched string.  <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
1572  *  of the matched backreferences (portions of the pattern between parentheses).
1573  *
1574  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1575  *     m          #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
1576  *     m[0]       #=> "HX1138"
1577  *     m[1, 2]    #=> ["H", "X"]
1578  *     m[1..3]    #=> ["H", "X", "113"]
1579  *     m[-3, 2]   #=> ["X", "113"]
1580  *
1581  *     m = /(?<foo>a+)b/.match("ccaaab")
1582  *     m          #=> #<MatchData "aaab" foo:"aaa">
1583  *     m["foo"]   #=> "aaa"
1584  *     m[:foo]    #=> "aaa"
1585  */
1586
1587 static VALUE
1588 match_aref(int argc, VALUE *argv, VALUE match)
1589 {
1590     VALUE idx, rest;
1591
1592     rb_scan_args(argc, argv, "11", &idx, &rest);
1593
1594     if (NIL_P(rest)) {
1595         if (FIXNUM_P(idx)) {
1596             int nth = FIX2INT(idx);
1597
1598             /* Negative indexes are left to the array path at the bottom. */
1599             if (nth >= 0) {
1600                 return rb_reg_nth_match(nth, match);
1601             }
1602         }
1603         else {
1604             const char *p = 0;
1605             int by_name = 1;
1606
1607             switch (TYPE(idx)) {
1608               case T_SYMBOL:
1609                 p = rb_id2name(SYM2ID(idx));
1610                 break;
1611               case T_STRING:
1612                 p = StringValuePtr(idx);
1613                 break;
1614               default:
1615                 /* Ranges and other objects also go through the array path. */
1616                 by_name = 0;
1617                 break;
1618             }
1619
1620             if (by_name) {
1621                 int num = name_to_backref_number(RMATCH_REGS(match),
1622                               RMATCH(match)->regexp, p, p + strlen(p));
1623                 return rb_reg_nth_match(num, match);
1624             }
1625         }
1626     }
1627
1628     return rb_ary_aref(argc, argv, match_to_a(match));
1629 }
1626
1627 /* Element accessor that match_values_at() hands to rb_get_values_at(). */
1628 static VALUE
1629 match_entry(VALUE match, long n)
1630 {
1631     int nth = (int)n;   /* rb_reg_nth_match() takes an int index */
1632
1633     return rb_reg_nth_match(nth, match);
1634 }
1632
1633
1634 /*
1635  *  call-seq:
1636  *
1637  *     mtch.values_at([index]*)   => array
1638  *
1639  *  Uses each <i>index</i> to access the matching values, returning an array of
1640  *  the corresponding matches.
1641  *
1642  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1643  *     m.to_a               #=> ["HX1138", "H", "X", "113", "8"]
1644  *     m.values_at(0, 2, -2)   #=> ["HX1138", "X", "113"]
1645  */
1646
1647 static VALUE
1648 match_values_at(int argc, VALUE *argv, VALUE match)
1649 {
1650     /* The register count bounds the indexes that can be requested. */
1651     int count = RMATCH_REGS(match)->num_regs;
1652
1653     return rb_get_values_at(match, count, argc, argv, match_entry);
1654 }
1653
1654
1655 /*
1656  *  call-seq:
1657  *     mtch.to_s   => str
1658  *
1659  *  Returns the entire matched string.
1660  *
1661  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1662  *     m.to_s   #=> "HX1138"
1663  */
1664
1665 static VALUE
1666 match_to_s(VALUE match)
1667 {
1668     VALUE text = rb_reg_last_match(match);
1669
1670     /* Fall back to an empty string when register 0 yields nil. */
1671     if (NIL_P(text)) {
1672         text = rb_str_new(0,0);
1673     }
1674     /* Taint comes from the MatchData itself or from the matched string. */
1675     if (OBJ_TAINTED(match) || OBJ_TAINTED(RMATCH(match)->str)) {
1676         OBJ_TAINT(text);
1677     }
1678     return text;
1679 }
1675
1676
1677 /*
1678  *  call-seq:
1679  *     mtch.string   => str
1680  *
1681  *  Returns a frozen copy of the string passed in to <code>match</code>.
1682  *
1683  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1684  *     m.string   #=> "THX1138."
1685  */
1686
1687 static VALUE
1688 match_string(VALUE match)
1689 {
1690     VALUE subject = RMATCH(match)->str;  /* str is frozen */
1691
1692     return subject;
1693 }
1692
1693 struct backref_name_tag {
         /* One entry per register, indexed by group number; match_inspect()
          * zeroes the table and match_inspect_name_iter() fills it in. */
1694     const UChar *name;  /* start of the group's name; stays 0 for unnamed groups */
1695     long len;           /* name_end - name, as reported by the iterator */
1696 };
1697
1698 /*
1699  * onig_foreach_name() callback used by match_inspect().  +arg0+ is an
1700  * array of struct backref_name_tag indexed by group number; every group
1701  * listed in +back_refs+ gets the name recorded.  Always returns 0.
1702  */
1703 static int
1704 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
1705           int back_num, int *back_refs, OnigRegex regex, void *arg0)
1706 {
1707     struct backref_name_tag *table = (struct backref_name_tag *)arg0;
1708     const long name_len = name_end - name;
1709     int k;
1710
1711     for (k = 0; k < back_num; k++) {
1712         struct backref_name_tag *slot = &table[back_refs[k]];
1713
1714         slot->name = name;
1715         slot->len = name_len;
1716     }
1717     return 0;
1718 }
1711
1712 /*
1713  * call-seq:
1714  *    mtch.inspect   => str
1715  *
1716  * Returns a printable version of <i>mtch</i>.
1717  *
1718  *     puts /.$/.match("foo").inspect
1719  *     #=> #<MatchData "o">
1720  *
1721  *     puts /(.)(.)(.)/.match("foo").inspect
1722  *     #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
1723  *
1724  *     puts /(.)(.)?(.)/.match("fo").inspect
1725  *     #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
1726  *
1727  *     puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
1728  *     #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
1729  *
1730  */
1731
1732 static VALUE
1733 match_inspect(VALUE match)
1734 {
1735     /* The class name is only read here, so hold it through a const pointer. */
1736     const char *cname = rb_obj_classname(match);
1737     VALUE str;
1738     int i;
1739     struct re_registers *regs = RMATCH_REGS(match);
1740     int num_regs = regs->num_regs;
1741     struct backref_name_tag *names;
1742     VALUE regexp = RMATCH(match)->regexp;
1743
1744     /* Without a regexp there is nothing to list; print the address only. */
1745     if (regexp == 0) {
1746         return rb_sprintf("#<%s:%p>", cname, (void*)match);
1747     }
1748
1749     /* names[i] receives the name of group i; unnamed groups stay zeroed. */
1750     names = ALLOCA_N(struct backref_name_tag, num_regs);
1751     MEMZERO(names, struct backref_name_tag, num_regs);
1752
1753     onig_foreach_name(RREGEXP(regexp)->ptr,
1754             match_inspect_name_iter, names);
1755
1756     str = rb_str_buf_new2("#<");
1757     rb_str_buf_cat2(str, cname);
1758
1759     for (i = 0; i < num_regs; i++) {
1760         VALUE v;
1761         rb_str_buf_cat2(str, " ");
1762         /* Register 0 is printed bare; groups get a "name:" or "N:" label. */
1763         if (0 < i) {
1764             if (names[i].name)
1765                 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
1766             else {
1767                 char buf[sizeof(i)*3+1];
1768                 snprintf(buf, sizeof(buf), "%d", i);
1769                 rb_str_buf_cat2(str, buf);
1770             }
1771             rb_str_buf_cat2(str, ":");
1772         }
1773         v = rb_reg_nth_match(i, match);
1774         if (v == Qnil)
1775             rb_str_buf_cat2(str, "nil");
1776         else
1777             rb_str_buf_append(str, rb_str_inspect(v));
1778     }
1779     rb_str_buf_cat2(str, ">");
1780
1781     return str;
1782 }
1779
1780 VALUE rb_cRegexp;  /* presumably the Regexp class object; it is assigned outside this section -- TODO confirm */
1781
1782 /*
1783  * Decode one backslash escape starting at *pp into a single byte.
1784  *
1785  * Accepts \\, \n, \t, \r, \f, \v, \a, \e, up to three octal digits, \x
1786  * followed by one or two hex digits, and the \M-, \C- and \c prefixes.
1787  * Each prefix may appear once and applies to the character or escape
1788  * that follows it: control masks the code with 0x1f, meta sets 0x80.
1789  *
1790  * Returns the byte value and advances *pp past the escape.  On failure
1791  * returns -1, copies a message into +err+ and leaves *pp untouched.
1792  */
1793 static int
1794 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
1795 {
1796     const char *p = *pp;
1797     int code;
1798     int meta_prefix = 0, ctrl_prefix = 0;
1799     int len;
1800
1801     if (p == end || *p++ != '\\') {
1802         strcpy(err, "too short escaped multibyte character");
1803         return -1;
1804     }
1805
1806 again:
1807     if (p == end) {
1808         strcpy(err, "too short escape sequence");
1809         return -1;
1810     }
1811     switch (*p++) {
1812       case '\\': code = '\\'; break;
1813       case 'n': code = '\n'; break;
1814       case 't': code = '\t'; break;
1815       case 'r': code = '\r'; break;
1816       case 'f': code = '\f'; break;
1817       case 'v': code = '\013'; break;
1818       case 'a': code = '\007'; break;
1819       case 'e': code = '\033'; break;
1820
1821       /* \OOO */
1822       case '0': case '1': case '2': case '3':
1823       case '4': case '5': case '6': case '7':
1824         p--;
1825         /* Compare distances rather than forming p+3, which could point
1826          * beyond the end of the buffer. */
1827         code = ruby_scan_oct(p, end - p < 3 ? end - p : 3, &len);
1828         p += len;
1829         break;
1830
1831       case 'x': /* \xHH */
1832         code = ruby_scan_hex(p, end - p < 2 ? end - p : 2, &len);
1833         if (len < 1) {
1834             strcpy(err, "invalid hex escape");
1835             return -1;
1836         }
1837         p += len;
1838         break;
1839
1840       case 'M': /* \M-X, \M-\C-X, \M-\cX */
1841         if (meta_prefix) {
1842             strcpy(err, "duplicate meta escape");
1843             return -1;
1844         }
1845         meta_prefix = 1;
1846         /* Needs at least '-' plus one more byte, which must be 7bit. */
1847         if (end - p > 1 && *p++ == '-' && (*p & 0x80) == 0) {
1848             if (*p == '\\') {
1849                 p++;
1850                 goto again;
1851             }
1852             else {
1853                 code = *p++;
1854                 break;
1855             }
1856         }
1857         strcpy(err, "too short meta escape");
1858         return -1;
1859
1860       case 'C': /* \C-X, \C-\M-X */
1861         if (p == end || *p++ != '-') {
1862             strcpy(err, "too short control escape");
1863             return -1;
1864         }
1865         /* fall through: after the '-', \C- is handled exactly like \c */
1866       case 'c': /* \cX, \c\M-X */
1867         if (ctrl_prefix) {
1868             strcpy(err, "duplicate control escape");
1869             return -1;
1870         }
1871         ctrl_prefix = 1;
1872         if (p < end && (*p & 0x80) == 0) {
1873             if (*p == '\\') {
1874                 p++;
1875                 goto again;
1876             }
1877             else {
1878                 code = *p++;
1879                 break;
1880             }
1881         }
1882         strcpy(err, "too short control escape");
1883         return -1;
1884
1885       default:
1886         strcpy(err, "unexpected escape sequence");
1887         return -1;
1888     }
1889     if (code < 0 || 0xff < code) {
1890         strcpy(err, "invalid escape code");
1891         return -1;
1892     }
1893
1894     if (ctrl_prefix)
1895         code &= 0x1f;
1896     if (meta_prefix)
1897         code |= 0x80;
1898
1899     *pp = p;
1900     return code;
1901 }
1889
1890 /*
1891  * Read consecutive byte escapes at *pp until they form one character of
1892  * +enc+ (or the encoding's maximum character length is reached) and
1893  * append the result to +buf+.
1894  *
1895  * A single 7bit byte is appended as a four-character \xHH escape.
1896  * Anything else is appended as raw bytes and fixes *encp to +enc+; a
1897  * different encoding already stored in *encp is an error.
1898  *
1899  * Returns 0 and advances *pp on success; returns -1 with a message in
1900  * +err+ on failure.
1901  */
1902 static int
1903 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
1904         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
1905 {
1906     const char *cur = *pp;
1907     int maxlen = rb_enc_mbmaxlen(enc);
1908     char *bytes = ALLOCA_N(char, maxlen);
1909     int nbytes = 0;
1910     int byte;
1911     int mbclen;
1912
1913     memset(bytes, 0, maxlen);
1914
1915     /* At least one escape is always consumed. */
1916     do {
1917         byte = read_escaped_byte(&cur, end, err);
1918         if (byte == -1) {
1919             return -1;
1920         }
1921         bytes[nbytes++] = byte;
1922     } while (nbytes < maxlen &&
1923              MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(bytes, bytes+nbytes, enc)));
1924
1925     mbclen = rb_enc_precise_mbclen(bytes, bytes+nbytes, enc);
1926     if (MBCLEN_INVALID_P(mbclen)) {
1927         strcpy(err, "invalid multibyte escape");
1928         return -1;
1929     }
1930
1931     if (nbytes <= 1 && !(bytes[0] & 0x80)) {
1932         /* One 7bit byte: keep it escaped in the output. */
1933         char escbuf[5];
1934         snprintf(escbuf, sizeof(escbuf), "\\x%02X", bytes[0]&0xff);
1935         rb_str_buf_cat(buf, escbuf, 4);
1936     }
1937     else {
1938         rb_str_buf_cat(buf, bytes, nbytes);
1939
1940         if (*encp == 0) {
1941             *encp = enc;
1942         }
1943         else if (*encp != enc) {
1944             strcpy(err, "escaped non ASCII character in UTF-8 regexp");
1945             return -1;
1946         }
1947     }
1948
1949     *pp = cur;
1950     return 0;
1951 }
1941
1942 /*
1943  * Validate a Unicode code point.  Returns 0 when +code+ is neither a
1944  * surrogate (0xd800..0xdfff) nor above 0x10ffff; otherwise stores a
1945  * message in +err+ and returns -1.
1946  */
1947 static int
1948 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
1949 {
1950     const int surrogate = (0xd800 <= code && code <= 0xdfff);
1951     const int too_large = (0x10ffff < code);
1952
1953     if (!surrogate && !too_large) {
1954         return 0;
1955     }
1956     strcpy(err, "invalid Unicode range");
1957     return -1;
1958 }
1952
1953 /*
1954  * Append code point +uv+ to +buf+.
1955  *
1956  * Values below 0x80 are written as a four-character \xHH escape.  Other
1957  * values are written as UTF-8 bytes and fix *encp to UTF-8; a different
1958  * encoding already stored in *encp is an error.
1959  *
1960  * Returns 0 on success, or -1 with a message in +err+.
1961  */
1962 static int
1963 append_utf8(unsigned long uv,
1964         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
1965 {
1966     if (check_unicode_range(uv, err) != 0) {
1967         return -1;
1968     }
1969
1970     if (uv >= 0x80) {
1971         char utf8buf[6];
1972         int len = rb_uv_to_utf8(utf8buf, uv);
1973
1974         rb_str_buf_cat(buf, utf8buf, len);
1975
1976         if (*encp == 0) {
1977             *encp = rb_utf8_encoding();
1978         }
1979         else if (*encp != rb_utf8_encoding()) {
1980             strcpy(err, "UTF-8 character in non UTF-8 regexp");
1981             return -1;
1982         }
1983     }
1984     else {
1985         char escbuf[5];
1986
1987         snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
1988         rb_str_buf_cat(buf, escbuf, 4);
1989     }
1990     return 0;
1991 }
1979
1980 /*
1981  * Parse the whitespace-separated hex code points found at *pp (the
1982  * inside of a \u{...} list) and append each one with append_utf8().
1983  *
1984  * Parsing stops at the first position where no hex digit is found; the
1985  * caller checks what follows.  A run of more than six hex digits or an
1986  * empty list is an error.
1987  *
1988  * Returns 0 and advances *pp on success, or -1 with a message in +err+.
1989  */
1990 static int
1991 unescape_unicode_list(const char **pp, const char *end,
1992         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
1993 {
1994     const char *cur = *pp;
1995     int seen_any = 0;
1996     unsigned long code;
1997     int len;
1998
1999     while (cur < end && ISSPACE(*cur)) {
2000         cur++;
2001     }
2002
2003     for (;;) {
2004         code = ruby_scan_hex(cur, end-cur, &len);
2005         if (len == 0) {
2006             break;
2007         }
2008         if (len > 6) { /* max 10FFFF */
2009             strcpy(err, "invalid Unicode range");
2010             return -1;
2011         }
2012         cur += len;
2013         if (append_utf8(code, buf, encp, err) != 0) {
2014             return -1;
2015         }
2016         seen_any = 1;
2017
2018         while (cur < end && ISSPACE(*cur)) {
2019             cur++;
2020         }
2021     }
2022
2023     if (!seen_any) {
2024         strcpy(err, "invalid Unicode list");
2025         return -1;
2026     }
2027
2028     *pp = cur;
2029     return 0;
2030 }
2016
2017 /*
2018  * Parse exactly four hex digits at *pp (the HHHH of a \uHHHH escape) and
2019  * append the code point with append_utf8().
2020  *
2021  * Returns 0 and advances *pp by four on success, or -1 with a message in
2022  * +err+ when fewer than four hex digits are available or the code point
2023  * is rejected.
2024  */
2025 static int
2026 unescape_unicode_bmp(const char **pp, const char *end,
2027         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
2028 {
2029     const char *p = *pp;
2030     int len;
2031     unsigned long code;
2032
2033     /* Compare the remaining length instead of forming p+4, which could
2034      * point beyond the end of the buffer. */
2035     if (end - p < 4) {
2036         strcpy(err, "invalid Unicode escape");
2037         return -1;
2038     }
2039     code = ruby_scan_hex(p, 4, &len);
2040     if (len != 4) {
2041         strcpy(err, "invalid Unicode escape");
2042         return -1;
2043     }
2044     if (append_utf8(code, buf, encp, err) != 0)
2045         return -1;
2046     *pp = p + 4;
2047     return 0;
2048 }
2039
2040 static int
2041 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
2042         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
2043 {
         /*
          * Copy the pattern text p..end into +buf+ one character at a time.
          * Raw non-ASCII characters are copied as they are; byte escapes
          * (\x, octal above 0177, \c, \C-, \M-) and \u escapes are decoded by
          * the helper functions; every other escape is copied verbatim.
          * *encp is set to the encoding implied by any non-ASCII content,
          * and a conflict with a previously stored encoding is an error.
          * Returns 0 on success, or -1 with a message in +err+.
          */
2044     char c;
2045     char smallbuf[2];
2046
2047     while (p < end) {
2048         int chlen = rb_enc_precise_mbclen(p, end, enc);
2049         if (!MBCLEN_CHARFOUND_P(chlen)) {
2050             strcpy(err, "invalid multibyte character");
2051             return -1;
2052         }
2053         chlen = MBCLEN_CHARFOUND_LEN(chlen);
             /* Multibyte character or a single byte with the high bit set:
              * copy it unchanged and record the encoding. */
2054         if (1 < chlen || (*p & 0x80)) {
2055             rb_str_buf_cat(buf, p, chlen);
2056             p += chlen;
2057             if (*encp == 0)
2058                 *encp = enc;
2059             else if (*encp != enc) {
2060                 strcpy(err, "non ASCII character in UTF-8 regexp");
2061                 return -1;
2062             }
2063             continue;
2064         }
2065
             /* From here on the current character is a single 7bit byte. */
2066         switch (c = *p++) {
2067           case '\\':
2068             if (p == end) {
2069                 strcpy(err, "too short escape sequence");
2070                 return -1;
2071             }
2072             switch (c = *p++) {
2073               case '1': case '2': case '3':
2074               case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
2075                 {
2076                     int octlen;
2077                     if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
2078                         /* backref or 7bit octal.
2079                            no need to unescape anyway.
2080                            re-escaping may break backref */
2081                         goto escape_asis;
2082                     }
2083                 }
2084                 /* xxx: How about more than 199 subexpressions? */
                     /* Octal value above 0177: falls through to the byte-escape
                      * handling shared with the cases below. */
2085
2086               case '0': /* \0, \0O, \0OO */
2087
2088               case 'x': /* \xHH */
2089               case 'c': /* \cX, \c\M-X */
2090               case 'C': /* \C-X, \C-\M-X */
2091               case 'M': /* \M-X, \M-\C-X, \M-\cX */
                     /* Step back over the escape letter and the backslash so
                      * that the helper reads the escape from its first byte. */
2092                 p = p-2;
2093                 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
2094                     return -1;
2095                 break;
2096
2097               case 'u':
2098                 if (p == end) {
2099                     strcpy(err, "too short escape sequence");
2100                     return -1;
2101                 }
2102                 if (*p == '{') {
2103                     /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
2104                     p++;
2105                     if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
2106                         return -1;
                         /* The list must be closed by '}' right where parsing stopped. */
2107                     if (p == end || *p++ != '}') {
2108                         strcpy(err, "invalid Unicode list");
2109                         return -1;
2110                     }
2111                     break;
2112                 }
2113                 else {
2114                     /* \uHHHH */
2115                     if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
2116                         return -1;
2117                     break;
2118                 }
2119
2120               default: /* \n, \\, \d, \9, etc. */
2121 escape_asis:
                     /* Emit the backslash and the escape letter untouched. */
2122                 smallbuf[0] = '\\';
2123                 smallbuf[1] = c;
2124                 rb_str_buf_cat(buf, smallbuf, 2);
2125                 break;
2126             }
2127             break;
2128
2129           default:
                 /* Ordinary 7bit character: copy the single byte. */
2130             rb_str_buf_cat(buf, &c, 1);
2131             break;
2132         }
2133     }
2134
2135     return 0;
2136 }
2137
2138 static VALUE
2139 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
2140         rb_encoding **fixed_enc, onig_errmsg_buffer err)
2141 {
2142     VALUE buf;
2143
2144     buf = rb_str_buf_new(0);
2145
2146     if (rb_enc_asciicompat(enc))
2147         *fixed_enc = 0;
2148     else {
2149         *fixed_enc = enc;
2150         rb_enc_associate(buf, enc);
2151     }
2152
2153     if (unescape_nonascii(p, end, enc, buf, fixed_enc, err) != 0)
2154         return Qnil;
2155
2156     if (*fixed_enc) {
2157         rb_enc_associate(buf, *fixed_enc);
2158     }
2159
2160     return buf;
2161 }
2162
2163 VALUE
2164 rb_reg_check_preprocess(VALUE str)
2165 {
2166     rb_encoding *fixed_enc = 0;
2167     onig_errmsg_buffer err = "";
2168     VALUE buf;
2169     char *p, *end;
2170     rb_encoding *enc;
2171
2172     StringValue(str);
2173     p = RSTRING_PTR(str);
2174     end = p + RSTRING_LEN(str);
2175     enc = rb_enc_get(str);
2176
2177     buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
2178     RB_GC_GUARD(str);
2179
2180     if (buf == Qnil) {
2181         return rb_reg_error_desc(str, 0, err);
2182     }
2183     return Qnil;
2184 }
2185
2186 static VALUE
2187 rb_reg_preprocess_dregexp(VALUE ary)
2188 {
2189     rb_encoding *fixed_enc = 0;
2190     rb_encoding *regexp_enc = 0;
2191     onig_errmsg_buffer err = "";
2192     int i;
2193     VALUE result = 0;
2194     int argc = RARRAY_LEN(ary);
2195     VALUE *argv = RARRAY_PTR(ary);
2196
2197     if (argc == 0) {
2198         rb_raise(rb_eArgError, "no arguments given");
2199     }
2200
2201     for (i = 0; i < argc; i++) {
2202         VALUE str = argv[i];
2203         VALUE buf;
2204         char *p, *end;
2205         rb_encoding *src_enc;
2206
2207         StringValue(str);
2208         p = RSTRING_PTR(str);
2209         end = p + RSTRING_LEN(str);
2210         src_enc = rb_enc_get(str);
2211
2212         buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
2213
2214         if (buf == Qnil)
2215             rb_raise(rb_eArgError, "%s", err);
2216
2217         if (fixed_enc != 0) {
2218             if (regexp_enc != 0 && regexp_enc != fixed_enc) {
2219                 rb_raise(rb_eArgError, "encoding mismatch in dynamic regexp : %s and %s",
2220                          rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
2221             }
2222             regexp_enc = fixed_enc;
2223         }
2224
2225         if (!result)
2226             result = rb_str_new3(str);
2227         else
2228             rb_str_buf_append(result, str);
2229     }
2230     if (regexp_enc) {
2231         rb_enc_associate(result, regexp_enc);
2232     }
2233
2234     return result;
2235 }
2236
2237 static int
2238 rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
2239                   int options, onig_errmsg_buffer err)
2240 {
2241     struct RRegexp *re = RREGEXP(obj);
2242     VALUE unescaped;
2243     rb_encoding *fixed_enc = 0;
2244     rb_encoding *a_enc = rb_ascii8bit_encoding();
2245
2246     if (!OBJ_TAINTED(obj) && rb_safe_level() >= 4)
2247         rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
2248     rb_check_frozen(obj);
2249     if (FL_TEST(obj, REG_LITERAL))
2250         rb_raise(rb_eSecurityError, "can't modify literal regexp");
2251     if (re->ptr) onig_free(re->ptr);
2252     if (re->str) free(re->str);
2253     re->ptr = 0;
2254     re->str = 0;
2255
2256     unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
2257     if (unescaped == Qnil)
2258         return -1;
2259
2260     if (fixed_enc) {
2261         if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
2262             (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
2263             strcpy(err, "incompatible character encoding");
2264             return -1;
2265         }
2266         if (fixed_enc != a_enc) {
2267             options |= ARG_ENCODING_FIXED;
2268             enc = fixed_enc;
2269         }
2270     }
2271     else if (!(options & ARG_ENCODING_FIXED)) {
2272        enc = rb_usascii_encoding();
2273     }
2274
2275     rb_enc_associate((VALUE)re, enc);
2276     if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
2277         re->basic.flags |= KCODE_FIXED;
2278     }
2279     if (options & ARG_ENCODING_NONE) {
2280         re->basic.flags |= REG_ENCODING_NONE;
2281     }
2282
2283     re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
2284             options & ARG_REG_OPTION_MASK, err);
2285     if (!re->ptr) return -1;
2286     re->str = ALLOC_N(char, len+1);
2287     memcpy(re->str, s, len);
2288     re->str[len] = '\0';
2289     re->len = len;
2290     RB_GC_GUARD(unescaped);
2291     return 0;
2292 }
2293
2294 static int
2295 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err)
2296 {
2297     int ret;
2298     rb_encoding *enc = rb_enc_get(str);
2299     if (options & ARG_ENCODING_NONE) {
2300         rb_encoding *ascii8bit = rb_ascii8bit_encoding();
2301         if (enc != ascii8bit) {
2302             if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
2303                 strcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2304                 return -1;
2305             }
2306             enc = ascii8bit;
2307         }
2308     }
2309     ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
2310                             options, err);
2311     RB_GC_GUARD(str);
2312     return ret;
2313 }
2314
2315 static VALUE
2316 rb_reg_s_alloc(VALUE klass)
2317 {
2318     NEWOBJ(re, struct RRegexp);
2319     OBJSETUP(re, klass, T_REGEXP);
2320
2321     re->ptr = 0;
2322     re->len = 0;
2323     re->str = 0;
2324
2325     return (VALUE)re;
2326 }
2327
2328 VALUE
2329 rb_reg_new_str(VALUE s, int options)
2330 {
2331     VALUE re = rb_reg_s_alloc(rb_cRegexp);
2332     onig_errmsg_buffer err = "";
2333
2334     if (rb_reg_initialize_str(re, s, options, err) != 0) {
2335         rb_reg_raise_str(s, options, err);
2336     }
2337
2338     return re;
2339 }
2340
2341 VALUE
2342 rb_reg_new_ary(VALUE ary, int opt)
2343 {
2344     return rb_reg_new_str(rb_reg_preprocess_dregexp(ary), opt);
2345 }
2346
2347 VALUE
2348 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
2349 {
2350     VALUE re = rb_reg_s_alloc(rb_cRegexp);
2351     onig_errmsg_buffer err = "";
2352
2353     if (rb_reg_initialize(re, s, len, enc, options, err) != 0) {
2354         rb_enc_reg_raise(s, len, enc, options, err);
2355     }
2356
2357     return re;
2358 }
2359
2360 VALUE
2361 rb_reg_new(const char *s, long len, int options)
2362 {
2363     return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
2364 }
2365
2366 VALUE
2367 rb_reg_compile(VALUE str, int options)
2368 {
2369     VALUE re = rb_reg_s_alloc(rb_cRegexp);
2370     onig_errmsg_buffer err = "";
2371
2372     if (!str) str = rb_str_new(0,0);
2373     if (rb_reg_initialize_str(re, str, options, err) != 0) {
2374         rb_set_errinfo(rb_reg_error_desc(str, options, err));
2375         return Qnil;
2376     }
2377     FL_SET(re, REG_LITERAL);
2378     return re;
2379 }
2380
2381 static VALUE reg_cache;
2382
2383 VALUE
2384 rb_reg_regcomp(VALUE str)
2385 {
2386     volatile VALUE save_str = str;
2387     if (reg_cache && RREGEXP(reg_cache)->len == RSTRING_LEN(str)
2388         && ENCODING_GET(reg_cache) == ENCODING_GET(str)
2389         && memcmp(RREGEXP(reg_cache)->str, RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
2390         return reg_cache;
2391
2392     return reg_cache = rb_reg_new_str(save_str, 0);
2393 }
2394
2395 /*
2396  * call-seq:
2397  *   rxp.hash   => fixnum
2398  *
2399  * Produce a hash based on the text and options of this regular expression.
2400  */
2401
2402 static VALUE
2403 rb_reg_hash(VALUE re)
2404 {
2405     int hashval, len;
2406     char *p;
2407
2408     rb_reg_check(re);
2409     hashval = RREGEXP(re)->ptr->options;
2410     len = RREGEXP(re)->len;
2411     p  = RREGEXP(re)->str;
2412     while (len--) {
2413         hashval = hashval * 33 + *p++;
2414     }
2415     hashval = hashval + (hashval>>5);
2416
2417     return INT2FIX(hashval);
2418 }
2419
2420
2421 /*
2422  *  call-seq:
2423  *     rxp == other_rxp      => true or false
2424  *     rxp.eql?(other_rxp)   => true or false
2425  *
2426  *  Equality---Two regexps are equal if their patterns are identical, they have
2427  *  the same character set code, and their <code>casefold?</code> values are the
2428  *  same.
2429  *
2430  *     /abc/  == /abc/x   #=> false
2431  *     /abc/  == /abc/i   #=> false
2432  *     /abc/  == /abc/n   #=> false
2433  *     /abc/u == /abc/n   #=> false
2434  */
2435
2436 static VALUE
2437 rb_reg_equal(VALUE re1, VALUE re2)
2438 {
2439     if (re1 == re2) return Qtrue;
2440     if (TYPE(re2) != T_REGEXP) return Qfalse;
2441     rb_reg_check(re1); rb_reg_check(re2);
2442     if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
2443     if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
2444     if (RREGEXP(re1)->len != RREGEXP(re2)->len) return Qfalse;
2445     if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
2446     if (memcmp(RREGEXP(re1)->str, RREGEXP(re2)->str, RREGEXP(re1)->len) == 0) {
2447         return Qtrue;
2448     }
2449     return Qfalse;
2450 }
2451
2452 static VALUE
2453 reg_operand(VALUE s, int check)
2454 {
2455     if (SYMBOL_P(s)) {
2456         return rb_sym_to_s(s);
2457     }
2458     else {
2459         VALUE tmp = rb_check_string_type(s);
2460         if (check && NIL_P(tmp)) {
2461             rb_raise(rb_eTypeError, "can't convert %s to String",
2462                      rb_obj_classname(s));
2463         }
2464         return tmp;
2465     }
2466 }
2467
2468 static long
2469 reg_match_pos(VALUE re, VALUE *strp, long pos)
2470 {
2471     VALUE str = *strp;
2472
2473     if (NIL_P(str)) {
2474         rb_backref_set(Qnil);
2475         return -1;
2476     }
2477     *strp = str = reg_operand(str, Qtrue);
2478     if (pos != 0) {
2479         if (pos < 0) {
2480             VALUE l = rb_str_length(str);
2481             pos += NUM2INT(l);
2482             if (pos < 0) {
2483                 return pos;
2484             }
2485         }
2486         pos = rb_reg_adjust_startpos(re, str, pos, 0);
2487     }
2488     return rb_reg_search(re, str, pos, 0);
2489 }
2490
2491 /*
2492  *  call-seq:
2493  *     rxp =~ str    => integer or nil
2494  *
2495  *  Match---Matches <i>rxp</i> against <i>str</i>.
2496  *
2497  *     /at/ =~ "input data"   #=> 7
2498  *     /ax/ =~ "input data"   #=> nil
2499  *
2500  *  If <code>=~</code> is used with a regexp literal with named captures,
2501  *  captured strings (or nil) are assigned to local variables named by
2502  *  the capture names.
2503  *
2504  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = y  "
2505  *     p lhs    #=> "x"
2506  *     p rhs    #=> "y"
2507  *
2508  *  If it is not matched, nil is assigned for the variables.
2509  *
2510  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = "
2511  *     p lhs    #=> nil
2512  *     p rhs    #=> nil
2513  *
2514  *  This assignment is implemented in the Ruby parser.
2515  *  So a regexp literal is required for the assignment.
2516  *  The assignment does not occur if the regexp is not a literal.
2517  *
2518  *     re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
2519  *     re =~ "  x = "
2520  *     p lhs    # undefined local variable
2521  *     p rhs    # undefined local variable
2522  *
2523  *  A regexp interpolation, <code>#{}</code>, also disables
2524  *  the assignment.
2525  *
2526  *     rhs_pat = /(?<rhs>\w+)/
2527  *     /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
2528  *     p lhs    # undefined local variable
2529  *
2530  */
2531
2532 VALUE
2533 rb_reg_match(VALUE re, VALUE str)
2534 {
2535     long pos = reg_match_pos(re, &str, 0);
2536     if (pos < 0) return Qnil;
2537     pos = rb_str_sublen(str, pos);
2538     return LONG2FIX(pos);
2539 }
2540
2541 /*
2542  *  call-seq:
2543  *     rxp === str   => true or false
2544  *
2545  *  Case Equality---Synonym for <code>Regexp#=~</code> used in case statements.
2546  *
2547  *     a = "HELLO"
2548  *     case a
2549  *     when /^[a-z]*$/; print "Lower case\n"
2550  *     when /^[A-Z]*$/; print "Upper case\n"
2551  *     else;            print "Mixed case\n"
2552  *     end
2553  *
2554  *  <em>produces:</em>
2555  *
2556  *     Upper case
2557  */
2558
2559 VALUE
2560 rb_reg_eqq(VALUE re, VALUE str)
2561 {
2562     long start;
2563
2564     str = reg_operand(str, Qfalse);
2565     if (NIL_P(str)) {
2566         rb_backref_set(Qnil);
2567         return Qfalse;
2568     }
2569     start = rb_reg_search(re, str, 0, 0);
2570     if (start < 0) {
2571         return Qfalse;
2572     }
2573     return Qtrue;
2574 }
2575
2576
2577 /*
2578  *  call-seq:
2579  *     ~ rxp   => integer or nil
2580  *
2581  *  Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
2582  *  Equivalent to <code><i>rxp</i> =~ $_</code>.
2583  *
2584  *     $_ = "input data"
2585  *     ~ /at/   #=> 7
2586  */
2587
2588 VALUE
2589 rb_reg_match2(VALUE re)
2590 {
2591     long start;
2592     VALUE line = rb_lastline_get();
2593
2594     if (TYPE(line) != T_STRING) {
2595         rb_backref_set(Qnil);
2596         return Qnil;
2597     }
2598
2599     start = rb_reg_search(re, line, 0, 0);
2600     if (start < 0) {
2601         return Qnil;
2602     }
2603     start = rb_str_sublen(line, start);
2604     return LONG2FIX(start);
2605 }
2606
2607
2608 /*
2609  *  call-seq:
2610  *     rxp.match(str)       => matchdata or nil
2611  *     rxp.match(str,pos)   => matchdata or nil
2612  *
2613  *  Returns a <code>MatchData</code> object describing the match, or
2614  *  <code>nil</code> if there was no match. This is equivalent to retrieving the
2615  *  value of the special variable <code>$~</code> following a normal match.
2616  *  If the second parameter is present, it specifies the position in the string
2617  *  to begin the search.
2618  *
2619  *     /(.)(.)(.)/.match("abc")[2]   #=> "b"
2620  *     /(.)(.)/.match("abc", 1)[2]   #=> "c"
2621  *
2622  *  If a block is given, invoke the block with MatchData if the match succeeds, so
2623  *  that you can write
2624  *
2625  *     pat.match(str) {|m| ...}
2626  *
2627  *  instead of
2628  *
2629  *     if m = pat.match(str)
2630  *       ...
2631  *     end
2632  *
2633  *  The return value is a value from block execution in this case.
2634  */
2635
2636 static VALUE
2637 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
2638 {
2639     VALUE result, str, initpos;
2640     long pos;
2641
2642     if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
2643         pos = NUM2LONG(initpos);
2644     }
2645     else {
2646         pos = 0;
2647     }
2648
2649     pos = reg_match_pos(re, &str, pos);
2650     if (pos < 0) {
2651         rb_backref_set(Qnil);
2652         return Qnil;
2653     }
2654     result = rb_backref_get();
2655     rb_match_busy(result);
2656     if (!NIL_P(result) && rb_block_given_p()) {
2657         return rb_yield(result);
2658     }
2659     return result;
2660 }
2661
2662 /*
2663  * Document-method: compile
2664  *
2665  * Synonym for <code>Regexp.new</code>
2666  */
2667
2668
2669 /*
2670  *  call-seq:
2671  *     Regexp.new(string [, options])                => regexp
2672  *     Regexp.new(regexp)                            => regexp
2673  *     Regexp.compile(string [, options])            => regexp
2674  *     Regexp.compile(regexp)                        => regexp
2675  *
2676  *  Constructs a new regular expression from <i>pattern</i>, which can be either
2677  *  a <code>String</code> or a <code>Regexp</code> (in which case that regexp's
2678  *  options are propagated, and new options may not be specified (a change as of
2679  *  Ruby 1.8)). If <i>options</i> is a <code>Fixnum</code>, it should be one or
2680  *  more of the constants <code>Regexp::EXTENDED</code>,
2681  *  <code>Regexp::IGNORECASE</code>, and <code>Regexp::MULTILINE</code>,
2682  *  <em>or</em>-ed together. Otherwise, if <i>options</i> is not
2683  *  <code>nil</code>, the regexp will be case insensitive.
2684  *
2685  *     r1 = Regexp.new('^a-z+:\\s+\w+')           #=> /^a-z+:\s+\w+/
2686  *     r2 = Regexp.new('cat', true)               #=> /cat/i
2687  *     r3 = Regexp.new('dog', Regexp::EXTENDED)   #=> /dog/x
2688  *     r4 = Regexp.new(r2)                        #=> /cat/i
2689  */
2690
2691 static VALUE
2692 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
2693 {
2694     onig_errmsg_buffer err = "";
2695     int flags = 0;
2696     VALUE str;
2697     rb_encoding *enc;
2698     const char *ptr;
2699     long len;
2700
2701     if (argc == 0 || argc > 3) {
2702         rb_raise(rb_eArgError, "wrong number of arguments");
2703     }
2704     if (TYPE(argv[0]) == T_REGEXP) {
2705         VALUE re = argv[0];
2706
2707         if (argc > 1) {
2708             rb_warn("flags ignored");
2709         }
2710         rb_reg_check(re);
2711         flags = rb_reg_options(re);
2712         ptr = RREGEXP(re)->str;
2713         len = RREGEXP(re)->len;
2714         enc = rb_enc_get(re);
2715         if (rb_reg_initialize(self, ptr, len, enc, flags, err)) {
2716             str = rb_enc_str_new(ptr, len, enc);
2717             rb_reg_raise_str(str, flags, err);
2718         }
2719     }
2720     else {
2721         if (argc >= 2) {
2722             if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
2723             else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
2724         }
2725         enc = 0;
2726         if (argc == 3 && !NIL_P(argv[2])) {
2727             char *kcode = StringValuePtr(argv[2]);
2728             if (kcode[0] == 'n' || kcode[1] == 'N') {
2729                 enc = rb_ascii8bit_encoding();
2730                 flags |= ARG_ENCODING_FIXED;
2731             }
2732             else {
2733                 rb_warning("encoding option is obsolete - %s", kcode);
2734             }
2735         }
2736         str = argv[0];
2737         ptr = StringValuePtr(str);
2738         if (enc
2739             ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err)
2740             : rb_reg_initialize_str(self, str, flags, err)) {
2741             rb_reg_raise_str(str, flags, err);
2742         }
2743     }
2744     return self;
2745 }
2746
2747 VALUE
2748 rb_reg_quote(VALUE str)
2749 {
2750     rb_encoding *enc = rb_enc_get(str);
2751     char *s, *send, *t;
2752     VALUE tmp;
2753     int c, clen;
2754     int ascii_only = rb_enc_str_asciionly_p(str);
2755
2756     s = RSTRING_PTR(str);
2757     send = s + RSTRING_LEN(str);
2758     while (s < send) {
2759         c = rb_enc_ascget(s, send, &clen, enc);
2760         if (c == -1) {
2761             s += mbclen(s, send, enc);
2762             continue;
2763         }
2764         switch (c) {
2765           case '[': case ']': case '{': case '}':
2766           case '(': case ')': case '|': case '-':
2767           case '*': case '.': case '\\':
2768           case '?': case '+': case '^': case '$':
2769           case ' ': case '#':
2770           case '\t': case '\f': case '\v': case '\n': case '\r':
2771             goto meta_found;
2772         }
2773         s += clen;
2774     }
2775     tmp = rb_str_new3(str);
2776     if (ascii_only) {
2777         rb_enc_associate(tmp, rb_usascii_encoding());
2778     }
2779     return tmp;
2780
2781   meta_found:
2782     tmp = rb_str_new(0, RSTRING_LEN(str)*2);
2783     if (ascii_only) {
2784         rb_enc_associate(tmp, rb_usascii_encoding());
2785     }
2786     else {
2787         rb_enc_copy(tmp, str);
2788     }
2789     t = RSTRING_PTR(tmp);
2790     /* copy upto metacharacter */
2791     memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
2792     t += s - RSTRING_PTR(str);
2793
2794     while (s < send) {
2795         c = rb_enc_ascget(s, send, &clen, enc);
2796         if (c == -1) {
2797             int n = mbclen(s, send, enc);
2798
2799             while (n--)
2800                 *t++ = *s++;
2801             continue;
2802         }
2803         s += clen;
2804         switch (c) {
2805           case '[': case ']': case '{': case '}':
2806           case '(': case ')': case '|': case '-':
2807           case '*': case '.': case '\\':
2808           case '?': case '+': case '^': case '$':
2809           case '#':
2810             *t++ = '\\';
2811             break;
2812           case ' ':
2813             *t++ = '\\';
2814             *t++ = ' ';
2815             continue;
2816           case '\t':
2817             *t++ = '\\';
2818             *t++ = 't';
2819             continue;
2820           case '\n':
2821             *t++ = '\\';
2822             *t++ = 'n';
2823             continue;
2824           case '\r':
2825             *t++ = '\\';
2826             *t++ = 'r';
2827             continue;
2828           case '\f':
2829             *t++ = '\\';
2830             *t++ = 'f';
2831             continue;
2832           case '\v':
2833             *t++ = '\\';
2834             *t++ = 'v';
2835             continue;
2836         }
2837         *t++ = c;
2838     }
2839     rb_str_resize(tmp, t - RSTRING_PTR(tmp));
2840     OBJ_INFECT(tmp, str);
2841     return tmp;
2842 }
2843
2844
2845 /*
2846  *  call-seq:
2847  *     Regexp.escape(str)   => string
2848  *     Regexp.quote(str)    => string
2849  *
2850  *  Escapes any characters that would have special meaning in a regular
2851  *  expression. Returns a new escaped string, or self if no characters are
2852  *  escaped.  For any string,
2853  *  <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
2854  *
2855  *     Regexp.escape('\*?{}.')   #=> \\\*\?\{\}\.
2856  *
2857  */
2858
2859 static VALUE
2860 rb_reg_s_quote(VALUE c, VALUE str)
2861 {
2862     return rb_reg_quote(reg_operand(str, Qtrue));
2863 }
2864
2865 int
2866 rb_reg_options(VALUE re)
2867 {
2868     int options;
2869
2870     rb_reg_check(re);
2871     options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
2872     if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
2873     if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
2874     return options;
2875 }
2876
2877 VALUE
2878 rb_check_regexp_type(VALUE re)
2879 {
2880     return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
2881 }
2882
2883 /*
2884  *  call-seq:
2885  *     Regexp.try_convert(obj) -> re or nil
2886  *
2887  *  Try to convert <i>obj</i> into a Regexp, using to_regexp method.
2888  *  Returns converted regexp or nil if <i>obj</i> cannot be converted
2889  *  for any reason.
2890  *
2891  *     Regexp.try_convert(/re/)         #=> /re/
2892  *     Regexp.try_convert("re")         #=> nil
2893  *
2894  *     o = Object.new
2895  *     Regexp.try_convert(o)            #=> nil
2896  *     def o.to_regexp() /foo/ end
2897  *     Regexp.try_convert(o)            #=> /foo/
2898  *
2899  */
2900 static VALUE
2901 rb_reg_s_try_convert(VALUE dummy, VALUE re)
2902 {
2903     return rb_check_regexp_type(re);
2904 }
2905
2906 static VALUE
2907 rb_reg_s_union(VALUE self, VALUE args0)
2908 {
2909     long argc = RARRAY_LEN(args0);
2910
2911     if (argc == 0) {
2912         VALUE args[1];
2913         args[0] = rb_str_new2("(?!)");
2914         return rb_class_new_instance(1, args, rb_cRegexp);
2915     }
2916     else if (argc == 1) {
2917         VALUE arg = rb_ary_entry(args0, 0);
2918         VALUE re = rb_check_regexp_type(arg);
2919         if (!NIL_P(re))
2920             return re;
2921         else {
2922             VALUE quoted;
2923             quoted = rb_reg_s_quote(Qnil, arg);
2924             return rb_reg_new_str(quoted, 0);
2925         }
2926     }
2927     else {
2928         int i;
2929         VALUE source = rb_str_buf_new(0);
2930         rb_encoding *result_enc;
2931
2932         int has_asciionly = 0;
2933         rb_encoding *has_ascii_compat_fixed = 0;
2934         rb_encoding *has_ascii_incompat = 0;
2935
2936         for (i = 0; i < argc; i++) {
2937             volatile VALUE v;
2938             VALUE e = rb_ary_entry(args0, i);
2939
2940             if (0 < i)
2941                 rb_str_buf_cat_ascii(source, "|");
2942
2943             v = rb_check_regexp_type(e);
2944             if (!NIL_P(v)) {
2945                 rb_encoding *enc = rb_enc_get(v);
2946                 if (!rb_enc_asciicompat(enc)) {
2947                     if (!has_ascii_incompat)
2948                         has_ascii_incompat = enc;
2949                     else if (has_ascii_incompat != enc)
2950                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2951                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
2952                 }
2953                 else if (rb_reg_fixed_encoding_p(v)) {
2954                     if (!has_ascii_compat_fixed)
2955                         has_ascii_compat_fixed = enc;
2956                     else if (has_ascii_compat_fixed != enc)
2957                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2958                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
2959                 }
2960                 else {
2961                     has_asciionly = 1;
2962                 }
2963                 v = rb_reg_to_s(v);
2964             }
2965             else {
2966                 rb_encoding *enc = rb_enc_get(e);
2967                 StringValue(e);
2968                 enc = rb_enc_get(e);
2969                 if (!rb_enc_str_asciicompat_p(e)) {
2970                     if (!has_ascii_incompat)
2971                         has_ascii_incompat = enc;
2972                     else if (has_ascii_incompat != enc)
2973                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2974                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
2975                 }
2976                 else if (rb_enc_str_asciionly_p(e)) {
2977                     has_asciionly = 1;
2978                 }
2979                 else {
2980                     if (!has_ascii_compat_fixed)
2981                         has_ascii_compat_fixed = enc;
2982                     else if (has_ascii_compat_fixed != enc)
2983                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2984                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
2985                 }
2986                 v = rb_reg_s_quote(Qnil, e);
2987             }
2988             if (has_ascii_incompat) {
2989                 if (has_asciionly) {
2990                     rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
2991                         rb_enc_name(has_ascii_incompat));
2992                 }
2993                 if (has_ascii_compat_fixed) {
2994                     rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
2995                         rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
2996                 }
2997             }
2998
2999             if (i == 0) {
3000                 rb_enc_copy(source, v);
3001             }
3002             rb_str_append(source, v);
3003         }
3004
3005         if (has_ascii_incompat) {
3006             result_enc = has_ascii_incompat;
3007         }
3008         else if (has_ascii_compat_fixed) {
3009             result_enc = has_ascii_compat_fixed;
3010         }
3011         else {
3012             result_enc = rb_ascii8bit_encoding();
3013         }
3014
3015         rb_enc_associate(source, result_enc);
3016         return rb_class_new_instance(1, &source, rb_cRegexp);
3017     }
3018 }
3019
3020 /*
3021  *  call-seq:
3022  *     Regexp.union(pat1, pat2, ...)            => new_regexp
3023  *     Regexp.union(pats_ary)                   => new_regexp
3024  *
3025  *  Return a <code>Regexp</code> object that is the union of the given
3026  *  <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s
3027  *  can be Regexp objects, in which case their options will be preserved, or
3028  *  Strings. If no patterns are given, returns <code>/(?!)/</code>.
3029  *
3030  *     Regexp.union                         #=> /(?!)/
3031  *     Regexp.union("penzance")             #=> /penzance/
3032  *     Regexp.union("a+b*c")                #=> /a\+b\*c/
3033  *     Regexp.union("skiing", "sledding")   #=> /skiing|sledding/
3034  *     Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
3035  *     Regexp.union(/dogs/, /cats/i)        #=> /(?-mix:dogs)|(?i-mx:cats)/
3036  */
3037 static VALUE
3038 rb_reg_s_union_m(VALUE self, VALUE args)
3039 {
3040     VALUE v;
3041     if (RARRAY_LEN(args) == 1 &&
3042         !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
3043         return rb_reg_s_union(self, v);
3044     }
3045     return rb_reg_s_union(self, args);
3046 }
3047
3048 /* :nodoc: */
3049 static VALUE
3050 rb_reg_init_copy(VALUE copy, VALUE re)
3051 {
3052     onig_errmsg_buffer err = "";
3053     const char *s;
3054     long len;
3055
3056     if (copy == re) return copy;
3057     rb_check_frozen(copy);
3058     /* need better argument type check */
3059     if (!rb_obj_is_instance_of(re, rb_obj_class(copy))) {
3060         rb_raise(rb_eTypeError, "wrong argument type");
3061     }
3062     rb_reg_check(re);
3063     s = RREGEXP(re)->str;
3064     len = RREGEXP(re)->len;
3065     if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re), err) != 0) {
3066         rb_reg_raise(s, len, err, re);
3067     }
3068     return copy;
3069 }
3070
/*
 * Builds the substitution text for one match by expanding the backslash
 * escapes found in the replacement template +str+.
 *
 * str    - replacement template; scanned one character at a time using
 *          its own encoding.
 * src    - string that the offsets in +regs+ index into; matched text is
 *          copied out of it.
 * regs   - begin/end offsets of the whole match (entry 0) and its groups.
 * regexp - consulted to decide whether numbered groups are active and to
 *          resolve \k<name> references.
 *
 * Recognized escapes:
 *   \1 .. \9   numbered group, but only when
 *              onig_noname_group_capture_is_active() is true for regexp;
 *              otherwise the escape is removed and nothing is inserted
 *   \k<name>   named group, resolved by name_to_backref_number(); a
 *              missing '>' raises RuntimeError; \k without '<' is copied
 *              through unchanged
 *   \0, \&     the whole match
 *   \`         the part of src before the match
 *   \'         the part of src after the match
 *   \+         the highest-numbered group whose begin offset is not -1;
 *              nothing is inserted when that search ends at group 0
 *   \\         one backslash
 * Any other escape is copied through unchanged, as is a backslash followed
 * by a character for which rb_enc_ascget() returns -1 (presumably a
 * non-ASCII character).  A group number that is >= regs->num_regs, or
 * whose begin offset is -1, contributes nothing.
 *
 * Returns +str+ itself when it contains no backslash that is followed by
 * another character; otherwise returns a newly built string.
 */
3071 VALUE
3072 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
3073 {
         /* val stays 0 until the first escape is found, so a template
          * without escapes can be returned as is. */
3074     VALUE val = 0;
         /* p: start of template text not yet copied into val
          * s: scan position, e: end of the template */
3075     char *p, *s, *e;
3076     int no, clen;
3077     rb_encoding *str_enc = rb_enc_get(str);
3078     rb_encoding *src_enc = rb_enc_get(src);
3079
3080     p = s = RSTRING_PTR(str);
3081     e = s + RSTRING_LEN(str);
3082
3083     while (s < e) {
3084         int c = rb_enc_ascget(s, e, &clen, str_enc);
3085         char *ss;
3086
             /* -1: skip over the whole character using its encoded length */
3087         if (c == -1) {
3088             s += mbclen(s, e, str_enc);
3089             continue;
3090         }
             /* ss remembers where a possible escape starts */
3091         ss = s;
3092         s += clen;
3093
             /* only a backslash that is not the last character starts an escape */
3094         if (c != '\\' || s == e) continue;
3095
3096         if (!val) {
3097             val = rb_str_buf_new(ss-p);
3098         }
             /* flush the literal text that precedes the backslash */
3099         rb_enc_str_buf_cat(val, p, ss-p, str_enc);
3100
             /* read the character that follows the backslash */
3101         c = rb_enc_ascget(s, e, &clen, str_enc);
3102         if (c == -1) {
                 /* copy the backslash and that character verbatim */
3103             s += mbclen(s, e, str_enc);
3104             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3105             p = s;
3106             continue;
3107         }
3108         s += clen;
3109
             /* By default the escape is consumed: pending text now starts
              * after it.  Branches that `continue` without appending
              * therefore drop the escape from the output. */
3110         p = s;
3111         switch (c) {
3112           case '1': case '2': case '3': case '4':
3113           case '5': case '6': case '7': case '8': case '9':
3114             if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
3115                 no = c - '0';
3116             }
3117             else {
3118                 continue;
3119             }
3120             break;
3121
3122           case 'k':
3123             if (s < e && rb_enc_ascget(s, e, &clen, str_enc) == '<') {
3124                 char *name, *name_end;
3125
                     /* scan forward to the closing '>' of \k<name> */
3126                 name_end = name = s + clen;
3127                 while (name_end < e) {
3128                     c = rb_enc_ascget(name_end, e, &clen, str_enc);
3129                     if (c == '>') break;
3130                     name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
3131                 }
3132                 if (name_end < e) {
3133                     no = name_to_backref_number(regs, regexp, name, name_end);
                         /* clen is the length of the '>' just read; resume after it */
3134                     p = s = name_end + clen;
3135                     break;
3136                 }
3137                 else {
3138                     rb_raise(rb_eRuntimeError, "invalid group name reference format");
3139                 }
3140             }
3141
                 /* \k not followed by '<': keep the two characters as they are */
3142             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3143             continue;
3144
3145           case '0':
3146           case '&':
3147             no = 0;
3148             break;
3149
3150           case '`':
                 /* everything in src before the start of the match */
3151             rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
3152             continue;
3153
3154           case '\'':
                 /* everything in src after the end of the match */
3155             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
3156             continue;
3157
3158           case '+':
                 /* walk down from the last register to the first one whose
                  * begin offset is set; stopping at 0 inserts nothing */
3159             no = regs->num_regs-1;
3160             while (BEG(no) == -1 && no > 0) no--;
3161             if (no == 0) continue;
3162             break;
3163
3164           case '\\':
                 /* append only the second backslash (the character just read) */
3165             rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
3166             continue;
3167
3168           default:
                 /* unrecognized escape: copy backslash and character unchanged */
3169             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3170             continue;
3171         }
3172
             /* reached only by the branches that selected a group number */
3173         if (no >= 0) {
3174             if (no >= regs->num_regs) continue;
3175             if (BEG(no) == -1) continue;
3176             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
3177         }
3178     }
3179
         /* no escape was ever expanded: hand back the template itself */
3180     if (!val) return str;
         /* append the literal tail that follows the last escape */
3181     if (p < e) {
3182         rb_enc_str_buf_cat(val, p, e-p, str_enc);
3183     }
3184
3185     return val;
3186 }
3187
3188 static VALUE
3189 kcode_getter(void)
3190 {
3191     rb_warn("variable $KCODE is no longer effective");
3192     return Qnil;
3193 }
3194
3195 static void
3196 kcode_setter(VALUE val, ID id)
3197 {
3198     rb_warn("variable $KCODE is no longer effective; ignored");
3199 }
3200
3201 static VALUE
3202 ignorecase_getter(void)
3203 {
3204     rb_warn("variable $= is no longer effective");
3205     return Qfalse;
3206 }
3207
3208 static void
3209 ignorecase_setter(VALUE val, ID id)
3210 {
3211     rb_warn("variable $= is no longer effective; ignored");
3212 }
3213
3214 static VALUE
3215 match_getter(void)
3216 {
3217     VALUE match = rb_backref_get();
3218
3219     if (NIL_P(match)) return Qnil;
3220     rb_match_busy(match);
3221     return match;
3222 }
3223
3224 static void
3225 match_setter(VALUE val)
3226 {
3227     if (!NIL_P(val)) {
3228         Check_Type(val, T_MATCH);
3229     }
3230     rb_backref_set(val);
3231 }
3232
3233 /*
3234  *  call-seq:
3235  *     Regexp.last_match           => matchdata
3236  *     Regexp.last_match(n)        => str
3237  *
3238  *  The first form returns the <code>MatchData</code> object generated by the
3239  *  last successful pattern match. Equivalent to reading the global variable
3240  *  <code>$~</code>. The second form returns the <i>n</i>th field in this
3241  *  <code>MatchData</code> object.
3242  *  <em>n</em> can be a string or symbol to reference a named capture.
3243  *
3244  *     /c(.)t/ =~ 'cat'        #=> 0
3245  *     Regexp.last_match       #=> #<MatchData "cat" 1:"a">
3246  *     Regexp.last_match(0)    #=> "cat"
3247  *     Regexp.last_match(1)    #=> "a"
3248  *     Regexp.last_match(2)    #=> nil
3249  *
3250  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
3251  *     Regexp.last_match       #=> #<MatchData "var = val" lhs:"var" rhs:"val">
3252  *     Regexp.last_match(:lhs) #=> "var"
3253  *     Regexp.last_match(:rhs) #=> "val"
3254  */
3255
3256 static VALUE
3257 rb_reg_s_last_match(int argc, VALUE *argv)
3258 {
3259     VALUE nth;
3260
3261     if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
3262         VALUE match = rb_backref_get();
3263         int n;
3264         if (NIL_P(match)) return Qnil;
3265         n = match_backref_number(match, nth);
3266         return rb_reg_nth_match(n, match);
3267     }
3268     return match_getter();
3269 }
3270
/*
 * Warning callback installed into Oniguruma by Init_Regexp().  The text
 * is passed as a "%s" argument rather than as the format string itself,
 * so a '%' inside the message is printed literally.
 */
static void
re_warn(const char *message)
{
    rb_warn("%s", message);
}
3276
3277 /*
3278  *  Document-class: Regexp
3279  *
3280  *  A <code>Regexp</code> holds a regular expression, used to match a pattern
3281  *  against strings. Regexps are created using the <code>/.../</code> and
3282  *  <code>%r{...}</code> literals, and by the <code>Regexp::new</code>
3283  *  constructor.
3284  *
3285  */
3286
3287 void
3288 Init_Regexp(void)
3289 {
         /*
          * Sets up everything this file exports to Ruby: the RegexpError
          * exception, Oniguruma defaults, the match-related special
          * variables, and the Regexp and MatchData classes with their
          * methods and constants.
          */

         /* RegexpError is a subclass of StandardError */
3290     rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
3291
         /* Oniguruma defaults: case table defined at the top of this file,
          * ASCII as default encoding, and warnings routed through re_warn() */
3292     onigenc_set_default_caseconv_table((UChar*)casetable);
3293     onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
3294     onig_set_warn_func(re_warn);
3295     onig_set_verb_warn_func(re_warn);
3296
         /* $~ has both getter and setter; the other match variables are
          * registered with a getter only (setter argument is 0) */
3297     rb_define_virtual_variable("$~", match_getter, match_setter);
3298     rb_define_virtual_variable("$&", last_match_getter, 0);
3299     rb_define_virtual_variable("$`", prematch_getter, 0);
3300     rb_define_virtual_variable("$'", postmatch_getter, 0);
3301     rb_define_virtual_variable("$+", last_paren_match_getter, 0);
3302
         /* obsolete variables: their getters and setters only warn;
          * $KCODE and $-K share the same pair of handlers */
3303     rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
3304     rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
3305     rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
3306
         /* class Regexp and its class-level methods;
          * "quote" and "escape" are two names for the same function */
3307     rb_cRegexp = rb_define_class("Regexp", rb_cObject);
3308     rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
3309     rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
3310     rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
3311     rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
3312     rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
3313     rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
3314     rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
3315
         /* Regexp instance methods; "eql?" and "==" share rb_reg_equal */
3316     rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
3317     rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
3318     rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
3319     rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
3320     rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
3321     rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
3322     rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
3323     rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
3324     rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
3325     rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
3326     rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
3327     rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
3328     rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
3329     rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
3330     rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */
3331     rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
3332     rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
3333     rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
3334
         /* option constants take their values from the Oniguruma flags */
3335     rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
3336     rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
3337     rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
3338
         /* NOTE(review): rb_global_variable() presumably registers
          * reg_cache with the garbage collector so the cached object is
          * kept alive; confirm against the C API documentation */
3339     rb_global_variable(&reg_cache);
3340
         /* class MatchData; its "new" class method is undefined here */
3341     rb_cMatch  = rb_define_class("MatchData", rb_cObject);
3342     rb_define_alloc_func(rb_cMatch, match_alloc);
3343     rb_undef_method(CLASS_OF(rb_cMatch), "new");
3344
         /* MatchData instance methods; "size" and "length" share match_size */
3345     rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
3346     rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
3347     rb_define_method(rb_cMatch, "names", match_names, 0);
3348     rb_define_method(rb_cMatch, "size", match_size, 0);
3349     rb_define_method(rb_cMatch, "length", match_size, 0);
3350     rb_define_method(rb_cMatch, "offset", match_offset, 1);
3351     rb_define_method(rb_cMatch, "begin", match_begin, 1);
3352     rb_define_method(rb_cMatch, "end", match_end, 1);
3353     rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
3354     rb_define_method(rb_cMatch, "[]", match_aref, -1);
3355     rb_define_method(rb_cMatch, "captures", match_captures, 0);
3356     rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
3357     rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
3358     rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
3359     rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
3360     rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
3361     rb_define_method(rb_cMatch, "string", match_string, 0);
3362 }