encoding.c

   1 /**********************************************************************
   2
   3   encoding.c -
   4
   5   $Author$
   6   created at: Thu May 24 17:23:27 JST 2007
   7
   8   Copyright (C) 2007 Yukihiro Matsumoto
   9
  10 **********************************************************************/
  11
  12 #include "ruby/ruby.h"
  13 #include "ruby/encoding.h"
  14 #include "regenc.h"
  15 #include <ctype.h>
  16 #ifdef HAVE_LANGINFO_H
  17 #include <langinfo.h>
  18 #endif
  19 #include "ruby/util.h"
  20
  21 static ID id_encoding, id_base_encoding;
  22 VALUE rb_cEncoding;
  23 static VALUE rb_encoding_list;
  24
/* One slot of the encoding table (enc_table.list). */
struct rb_encoding_entry {
    const char *name;   /* name the slot was registered under; duplicated in enc_register_at */
    rb_encoding *enc;   /* table-owned copy of the encoding, allocated in enc_register_at */
    rb_encoding *base;  /* encoding this one was replicated from; set by set_base_encoding */
};
  30
/* File-global registry of encodings. */
static struct {
    struct rb_encoding_entry *list;  /* slot array, grown by enc_table_expand */
    int count;                       /* number of slots in use */
    int size;                        /* number of slots allocated in list */
    st_table *names;                 /* name/alias -> slot index, created with st_init_strcasetable */
} enc_table;
  37
  38 void rb_enc_init(void);
  39
  40 #define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
  41
  42 #define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc))
  43
  44 static int load_encoding(const char *name);
  45 static VALUE enc_base_encoding(VALUE self);
  46
  47 static void
  48 enc_mark(void *ptr)
  49 {
  50 }
  51
  52 static VALUE
  53 enc_new(rb_encoding *encoding)
  54 {
  55     return Data_Wrap_Struct(rb_cEncoding, enc_mark, 0, encoding);
  56 }
  57
  58 VALUE
  59 rb_enc_from_encoding(rb_encoding *encoding)
  60 {
  61     VALUE list, enc;
  62     int idx;
  63
  64     if (!encoding) return Qnil;
  65     idx = ENC_TO_ENCINDEX(encoding);
  66     if (!(list = rb_encoding_list)) {
  67         rb_bug("rb_enc_from_encoding(%d\"%s\"): no rb_encoding_list",
  68                idx, rb_enc_name(encoding));
  69     }
  70     enc = rb_ary_entry(list, idx);
  71     if (NIL_P(enc)) {
  72         rb_bug("rb_enc_from_encoding(%d\"%s\"): not created yet",
  73                idx, rb_enc_name(encoding));
  74     }
  75     return enc;
  76 }
  77
  78 static int enc_autoload(rb_encoding *);
  79
  80 static int
  81 check_encoding(rb_encoding *enc)
  82 {
  83     int index = rb_enc_to_index(enc);
  84     if (rb_enc_from_index(index) != enc)
  85         return -1;
  86     if (enc_autoload_p(enc)) {
  87         index = enc_autoload(enc);
  88     }
  89     return index;
  90 }
  91
  92 static int
  93 enc_check_encoding(VALUE obj)
  94 {
  95     if (SPECIAL_CONST_P(obj) || BUILTIN_TYPE(obj) != T_DATA ||
  96         RDATA(obj)->dmark != enc_mark) {
  97         return -1;
  98     }
  99     return check_encoding(RDATA(obj)->data);
 100 }
 101
 102 static int
 103 must_encoding(VALUE enc)
 104 {
 105     int index = enc_check_encoding(enc);
 106     if (index < 0) {
 107         rb_raise(rb_eTypeError, "wrong argument type %s (expected Encoding)",
 108                  rb_obj_classname(enc));
 109     }
 110     return index;
 111 }
 112
 113 int
 114 rb_to_encoding_index(VALUE enc)
 115 {
 116     int idx;
 117
 118     idx = enc_check_encoding(enc);
 119     if (idx >= 0) {
 120         return idx;
 121     }
 122     else if (NIL_P(enc = rb_check_string_type(enc))) {
 123         return -1;
 124     }
 125     if (!rb_enc_asciicompat(rb_enc_get(enc))) {
 126         return -1;
 127     }
 128     return rb_enc_find_index(StringValueCStr(enc));
 129 }
 130
 131 static rb_encoding *
 132 to_encoding(VALUE enc)
 133 {
 134     int idx;
 135
 136     StringValue(enc);
 137     if (!rb_enc_asciicompat(rb_enc_get(enc))) {
 138         rb_raise(rb_eArgError, "invalid name encoding (non ASCII)");
 139     }
 140     idx = rb_enc_find_index(StringValueCStr(enc));
 141     if (idx < 0) {
 142         rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_PTR(enc));
 143     }
 144     return rb_enc_from_index(idx);
 145 }
 146
 147 rb_encoding *
 148 rb_to_encoding(VALUE enc)
 149 {
 150     if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
 151     return to_encoding(enc);
 152 }
 153
/* GC hook for encodings; currently does nothing. */
void
rb_gc_mark_encodings(void)
{
}
 158
 159 static int
 160 enc_table_expand(int newsize)
 161 {
 162     struct rb_encoding_entry *ent;
 163     int count = newsize;
 164
 165     if (enc_table.size >= newsize) return newsize;
 166     newsize = (newsize + 7) / 8 * 8;
 167     ent = realloc(enc_table.list, sizeof(*enc_table.list) * newsize);
 168     if (!ent) return -1;
 169     memset(ent + enc_table.size, 0, sizeof(*ent)*(newsize - enc_table.size));
 170     enc_table.list = ent;
 171     enc_table.size = newsize;
 172     return count;
 173 }
 174
 175 static int
 176 enc_register_at(int index, const char *name, rb_encoding *encoding)
 177 {
 178     struct rb_encoding_entry *ent = &enc_table.list[index];
 179     VALUE list;
 180
 181     if (!ent->name) {
 182         ent->name = name = strdup(name);
 183     }
 184     else if (STRCASECMP(name, ent->name)) {
 185         return -1;
 186     }
 187     if (!ent->enc) {
 188         ent->enc = xmalloc(sizeof(rb_encoding));
 189     }
 190     if (encoding) {
 191         *ent->enc = *encoding;
 192     }
 193     else {
 194         memset(ent->enc, 0, sizeof(*ent->enc));
 195     }
 196     encoding = ent->enc;
 197     encoding->name = name;
 198     encoding->ruby_encoding_index = index;
 199     st_insert(enc_table.names, (st_data_t)name, (st_data_t)index);
 200     list = rb_encoding_list;
 201     if (list && NIL_P(rb_ary_entry(list, index))) {
 202         /* initialize encoding data */
 203         rb_ary_store(list, index, enc_new(encoding));
 204     }
 205     return index;
 206 }
 207
 208 static int
 209 enc_register(const char *name, rb_encoding *encoding)
 210 {
 211     int index = enc_table.count;
 212
 213     if ((index = enc_table_expand(index + 1)) < 0) return -1;
 214     enc_table.count = index;
 215     return enc_register_at(index - 1, name, encoding);
 216 }
 217
 218 static void set_encoding_const(const char *, rb_encoding *);
 219 int rb_enc_registered(const char *name);
 220
 221 int
 222 rb_enc_register(const char *name, rb_encoding *encoding)
 223 {
 224     int index = rb_enc_registered(name);
 225
 226     if (index >= 0) {
 227         rb_encoding *oldenc = rb_enc_from_index(index);
 228         if (STRCASECMP(name, rb_enc_name(oldenc))) {
 229             index = enc_register(name, encoding);
 230         }
 231         else if (enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) {
 232             enc_register_at(index, name, encoding);
 233         }
 234         else {
 235             rb_raise(rb_eArgError, "encoding %s is already registered", name);
 236         }
 237     }
 238     else {
 239         index = enc_register(name, encoding);
 240         set_encoding_const(name, rb_enc_from_index(index));
 241     }
 242     return index;
 243 }
 244
 245 void
 246 rb_encdb_declare(const char *name)
 247 {
 248     int idx = rb_enc_registered(name);
 249     if (idx < 0) {
 250         idx = enc_register(name, 0);
 251     }
 252     set_encoding_const(name, rb_enc_from_index(idx));
 253 }
 254
 255 static void
 256 enc_check_duplication(const char *name)
 257 {
 258     if (rb_enc_registered(name) >= 0) {
 259         rb_raise(rb_eArgError, "encoding %s is already registered", name);
 260     }
 261 }
 262
 263 static rb_encoding*
 264 set_base_encoding(int index, rb_encoding *base)
 265 {
 266     rb_encoding *enc = enc_table.list[index].enc;
 267
 268     enc_table.list[index].base = base;
 269     if (rb_enc_dummy_p(base)) ENC_SET_DUMMY(enc);
 270     return enc;
 271 }
 272
 273 int
 274 rb_enc_replicate(const char *name, rb_encoding *encoding)
 275 {
 276     int idx;
 277
 278     enc_check_duplication(name);
 279     idx = enc_register(name, encoding);
 280     set_base_encoding(idx, encoding);
 281     set_encoding_const(name, rb_enc_from_index(idx));
 282     return idx;
 283 }
 284
 285 static int
 286 enc_replicate(int idx, const char *name, rb_encoding *origenc)
 287 {
 288     if (idx < 0) {
 289         idx = enc_register(name, origenc);
 290     }
 291     else {
 292         idx = enc_register_at(idx, name, origenc);
 293     }
 294     if (idx >= 0) {
 295         set_base_encoding(idx, origenc);
 296         set_encoding_const(name, rb_enc_from_index(idx));
 297     }
 298     return idx;
 299 }
 300
 301 int
 302 rb_encdb_replicate(const char *name, const char *orig)
 303 {
 304     int origidx = rb_enc_registered(orig);
 305     int idx = rb_enc_registered(name);
 306
 307     if (origidx < 0) {
 308         origidx = enc_register(orig, 0);
 309     }
 310     return enc_replicate(idx, name, rb_enc_from_index(origidx));
 311 }
 312
 313 int
 314 rb_define_dummy_encoding(const char *name)
 315 {
 316     int index = rb_enc_replicate(name, rb_ascii8bit_encoding());
 317     rb_encoding *enc = enc_table.list[index].enc;
 318
 319     ENC_SET_DUMMY(enc);
 320     return index;
 321 }
 322
 323 int
 324 rb_encdb_dummy(const char *name)
 325 {
 326     int index = enc_replicate(rb_enc_registered(name), name,
 327                               rb_ascii8bit_encoding());
 328     rb_encoding *enc = enc_table.list[index].enc;
 329
 330     ENC_SET_DUMMY(enc);
 331     return index;
 332 }
 333
 334 /*
 335  * call-seq:
 336  *   enc.dummy? => true or false
 337  *
 338  * Returns true for dummy encodings.
 339  * A dummy encoding is an encoding for which character handling is not properly
 340  * implemented.
 341  * It is used for stateful encodings.
 342  *
 343  *   Encoding::ISO_2022_JP.dummy?       #=> true
 344  *   Encoding::UTF_8.dummy?             #=> false
 345  *
 346  */
 347 static VALUE
 348 enc_dummy_p(VALUE enc)
 349 {
 350     return ENC_DUMMY_P(enc_table.list[must_encoding(enc)].enc) ? Qtrue : Qfalse;
 351 }
 352
 353 static int
 354 enc_alias(const char *alias, int idx)
 355 {
 356     alias = strdup(alias);
 357     st_insert(enc_table.names, (st_data_t)alias, (st_data_t)idx);
 358     set_encoding_const(alias, rb_enc_from_index(idx));
 359     return idx;
 360 }
 361
 362 int
 363 rb_enc_alias(const char *alias, const char *orig)
 364 {
 365     int idx;
 366
 367     enc_check_duplication(alias);
 368     if (!enc_table.list) {
 369         rb_enc_init();
 370     }
 371     if ((idx = rb_enc_find_index(orig)) < 0) {
 372         return -1;
 373     }
 374     return enc_alias(alias, idx);
 375 }
 376
 377 int
 378 rb_encdb_alias(const char *alias, const char *orig)
 379 {
 380     int idx = rb_enc_registered(orig);
 381
 382     if (idx < 0) {
 383         idx = enc_register(orig, 0);
 384     }
 385     return enc_alias(alias, idx);
 386 }
 387
/* Fixed table indexes of the encodings registered by rb_enc_init(). */
enum {
    ENCINDEX_ASCII,         /* slot returned by rb_ascii8bit_encoding() */
    ENCINDEX_UTF_8,         /* slot returned by rb_utf8_encoding() */
    ENCINDEX_US_ASCII,      /* slot returned by rb_usascii_encoding() */
    ENCINDEX_BUILTIN_MAX    /* number of builtin slots */
};
 394
 395 extern rb_encoding OnigEncodingUTF_8;
 396 extern rb_encoding OnigEncodingUS_ASCII;
 397
 398 void
 399 rb_enc_init(void)
 400 {
 401     enc_table_expand(ENCODING_COUNT + 1);
 402     if (!enc_table.names) {
 403         enc_table.names = st_init_strcasetable();
 404     }
 405 #define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
 406     ENC_REGISTER(ASCII);
 407     ENC_REGISTER(UTF_8);
 408     ENC_REGISTER(US_ASCII);
 409 #undef ENC_REGISTER
 410     enc_table.count = ENCINDEX_BUILTIN_MAX;
 411 }
 412
 413 rb_encoding *
 414 rb_enc_from_index(int index)
 415 {
 416     if (!enc_table.list) {
 417         rb_enc_init();
 418     }
 419     if (index < 0 || enc_table.count <= index) {
 420         return 0;
 421     }
 422     return enc_table.list[index].enc;
 423 }
 424
 425 int
 426 rb_enc_registered(const char *name)
 427 {
 428     st_data_t idx = 0;
 429
 430     if (!name) return -1;
 431     if (!enc_table.list) return -1;
 432     if (st_lookup(enc_table.names, (st_data_t)name, &idx)) {
 433         return (int)idx;
 434     }
 435     return -1;
 436 }
 437
 438 static VALUE
 439 require_enc(VALUE enclib)
 440 {
 441     return rb_require_safe(enclib, rb_safe_level());
 442 }
 443
 444 static int
 445 load_encoding(const char *name)
 446 {
 447     VALUE enclib = rb_sprintf("enc/%s", name);
 448     VALUE verbose = ruby_verbose;
 449     VALUE debug = ruby_debug;
 450     VALUE loaded;
 451     char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib);
 452     int idx;
 453
 454     while (s < e) {
 455         if (!ISALNUM(*s)) *s = '_';
 456         else if (ISUPPER(*s)) *s = TOLOWER(*s);
 457         ++s;
 458     }
 459     OBJ_FREEZE(enclib);
 460     ruby_verbose = Qfalse;
 461     ruby_debug = Qfalse;
 462     loaded = rb_protect(require_enc, enclib, 0);
 463     ruby_verbose = verbose;
 464     ruby_debug = debug;
 465     rb_set_errinfo(Qnil);
 466     if (NIL_P(loaded)) return -1;
 467     if ((idx = rb_enc_registered(name)) < 0) return -1;
 468     if (enc_autoload_p(enc_table.list[idx].enc)) return -1;
 469     return idx;
 470 }
 471
 472 static int
 473 enc_autoload(rb_encoding *enc)
 474 {
 475     int i;
 476     rb_encoding *base = enc_table.list[ENC_TO_ENCINDEX(enc)].base;
 477
 478     if (base) {
 479         i = 0;
 480         do {
 481             if (i >= enc_table.count) return -1;
 482         } while (enc_table.list[i].enc != base && (++i, 1));
 483         if (enc_autoload_p(base)) {
 484             if (enc_autoload(base) < 0) return -1;
 485         }
 486         i = ENC_TO_ENCINDEX(enc);
 487         enc_register_at(i, rb_enc_name(enc), base);
 488     }
 489     else {
 490         i = load_encoding(rb_enc_name(enc));
 491     }
 492     return i;
 493 }
 494
 495 int
 496 rb_enc_find_index(const char *name)
 497 {
 498     int i = rb_enc_registered(name);
 499     rb_encoding *enc;
 500
 501     if (i < 0) {
 502         i = load_encoding(name);
 503     }
 504     else if (enc_autoload_p(enc = rb_enc_from_index(i))) {
 505         if (enc_autoload(enc) < 0) {
 506             rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
 507                     name);
 508             return 0;
 509         }
 510     }
 511     return i;
 512 }
 513
 514 rb_encoding *
 515 rb_enc_find(const char *name)
 516 {
 517     int idx = rb_enc_find_index(name);
 518     if (idx < 0) idx = 0;
 519     return rb_enc_from_index(idx);
 520 }
 521
 522 static inline int
 523 enc_capable(VALUE obj)
 524 {
 525     if (SPECIAL_CONST_P(obj)) return Qfalse;
 526     switch (BUILTIN_TYPE(obj)) {
 527       case T_STRING:
 528       case T_REGEXP:
 529       case T_FILE:
 530         return Qtrue;
 531       case T_DATA:
 532         if (RDATA(obj)->dmark == enc_mark) return Qtrue;
 533       default:
 534         return Qfalse;
 535     }
 536 }
 537
/*
 * Returns the ID of "encoding", the instance variable under which
 * rb_enc_set_index() stores indexes that do not fit inline.
 */
ID
rb_id_encoding(void)
{
    CONST_ID(id_encoding, "encoding");
    return id_encoding;
}
 544
 545 int
 546 rb_enc_get_index(VALUE obj)
 547 {
 548     int i;
 549
 550     i = ENCODING_GET_INLINED(obj);
 551     if (i == ENCODING_INLINE_MAX) {
 552         VALUE iv;
 553
 554         iv = rb_ivar_get(obj, rb_id_encoding());
 555         i = NUM2INT(iv);
 556     }
 557     return i;
 558 }
 559
 560 void
 561 rb_enc_set_index(VALUE obj, int idx)
 562 {
 563     if (idx < ENCODING_INLINE_MAX) {
 564         ENCODING_SET_INLINED(obj, idx);
 565         return;
 566     }
 567     ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
 568     rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
 569     return;
 570 }
 571
 572 VALUE
 573 rb_enc_associate_index(VALUE obj, int idx)
 574 {
 575 /*    enc_check_capable(obj);*/
 576     if (rb_enc_get_index(obj) == idx)
 577         return obj;
 578     if (!ENC_CODERANGE_ASCIIONLY(obj) ||
 579         !rb_enc_asciicompat(rb_enc_from_index(idx))) {
 580         ENC_CODERANGE_CLEAR(obj);
 581     }
 582     rb_enc_set_index(obj, idx);
 583     return obj;
 584 }
 585
 586 VALUE
 587 rb_enc_associate(VALUE obj, rb_encoding *enc)
 588 {
 589     return rb_enc_associate_index(obj, rb_enc_to_index(enc));
 590 }
 591
 592 rb_encoding*
 593 rb_enc_get(VALUE obj)
 594 {
 595     return rb_enc_from_index(rb_enc_get_index(obj));
 596 }
 597
 598 rb_encoding*
 599 rb_enc_check(VALUE str1, VALUE str2)
 600 {
 601     rb_encoding *enc = rb_enc_compatible(str1, str2);
 602     if (!enc)
 603         rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
 604                  rb_enc_name(rb_enc_get(str1)),
 605                  rb_enc_name(rb_enc_get(str2)));
 606     return enc;
 607 }
 608
 609 rb_encoding*
 610 rb_enc_compatible(VALUE str1, VALUE str2)
 611 {
 612     int idx1, idx2;
 613     rb_encoding *enc1, *enc2;
 614
 615     idx1 = rb_enc_get_index(str1);
 616     idx2 = rb_enc_get_index(str2);
 617
 618     if (idx1 < 0 || idx2 < 0)
 619         return 0;
 620
 621     if (idx1 == idx2) {
 622         return rb_enc_from_index(idx1);
 623     }
 624     enc1 = rb_enc_from_index(idx1);
 625     enc2 = rb_enc_from_index(idx2);
 626
 627     if (TYPE(str2) == T_STRING && RSTRING_LEN(str2) == 0)
 628         return enc1;
 629     if (TYPE(str1) == T_STRING && RSTRING_LEN(str1) == 0)
 630         return enc2;
 631     if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
 632         return 0;
 633     }
 634
 635     if (BUILTIN_TYPE(str1) != T_STRING) {
 636         VALUE tmp = str1;
 637         int idx0 = idx1;
 638         str1 = str2;
 639         str2 = tmp;
 640         idx1 = idx2;
 641         idx2 = idx0;
 642     }
 643     if (BUILTIN_TYPE(str1) == T_STRING) {
 644         int cr1, cr2;
 645
 646         cr1 = rb_enc_str_coderange(str1);
 647         if (BUILTIN_TYPE(str2) == T_STRING) {
 648             cr2 = rb_enc_str_coderange(str2);
 649             if (cr1 != cr2) {
 650                 /* may need to handle ENC_CODERANGE_BROKEN */
 651                 if (cr1 == ENC_CODERANGE_7BIT) return enc2;
 652                 if (cr2 == ENC_CODERANGE_7BIT) return enc1;
 653             }
 654             if (cr2 == ENC_CODERANGE_7BIT) {
 655                 if (idx1 == 0) return enc2;
 656                 return enc1;
 657             }
 658         }
 659         if (cr1 == ENC_CODERANGE_7BIT)
 660             return enc2;
 661     }
 662     return 0;
 663 }
 664
 665 void
 666 rb_enc_copy(VALUE obj1, VALUE obj2)
 667 {
 668     rb_enc_associate_index(obj1, rb_enc_get_index(obj2));
 669 }
 670
 671
 672 /*
 673  *  call-seq:
 674  *     obj.encoding   => encoding
 675  *
 676  *  Returns the Encoding object that represents the encoding of obj.
 677  */
 678
 679 VALUE
 680 rb_obj_encoding(VALUE obj)
 681 {
 682     rb_encoding *enc = rb_enc_get(obj);
 683     if (!enc) {
 684         rb_raise(rb_eTypeError, "unknown encoding");
 685     }
 686     return rb_enc_from_encoding(enc);
 687 }
 688
 689 int
 690 rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
 691 {
 692     int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
 693     if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p)
 694         return MBCLEN_CHARFOUND_LEN(n);
 695     else {
 696         int min = rb_enc_mbminlen(enc);
 697         return min <= e-p ? min : e-p;
 698     }
 699 }
 700
 701 int
 702 rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
 703 {
 704     int n;
 705     if (e <= p)
 706         return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
 707     n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
 708     if (e-p < n)
 709         return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(e-p));
 710     return n;
 711 }
 712
 713 int
 714 rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
 715 {
 716     int c, l;
 717     if (e <= p)
 718         return -1;
 719     if (rb_enc_asciicompat(enc)) {
 720         c = (unsigned char)*p;
 721         if (!ISASCII(c))
 722             return -1;
 723         if (len) *len = 1;
 724         return c;
 725     }
 726     l = rb_enc_precise_mbclen(p, e, enc);
 727     if (!MBCLEN_CHARFOUND_P(l))
 728         return -1;
 729     c = rb_enc_mbc_to_codepoint(p, e, enc);
 730     if (!rb_enc_isascii(c, enc))
 731         return -1;
 732     if (len) *len = l;
 733     return c;
 734 }
 735
 736 int
 737 rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
 738 {
 739     int r;
 740     if (e <= p)
 741         rb_raise(rb_eArgError, "empty string");
 742     r = rb_enc_precise_mbclen(p, e, enc);
 743     if (MBCLEN_CHARFOUND_P(r))
 744         return rb_enc_mbc_to_codepoint(p, e, enc);
 745     else
 746         rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
 747 }
 748
 749 int
 750 rb_enc_codelen(int c, rb_encoding *enc)
 751 {
 752     int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
 753     if (n == 0) {
 754         rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc));
 755     }
 756     return n;
 757 }
 758
 759 int
 760 rb_enc_toupper(int c, rb_encoding *enc)
 761 {
 762     return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c));
 763 }
 764
 765 int
 766 rb_enc_tolower(int c, rb_encoding *enc)
 767 {
 768     return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c));
 769 }
 770
 771 /*
 772  * call-seq:
 773  *   enc.inspect => string
 774  *
 775  * Returns a string which represents the encoding for programmers.
 776  *
 777  *   Encoding::UTF_8.inspect       #=> "#<Encoding:UTF-8>"
 778  *   Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
 779  */
 780 static VALUE
 781 enc_inspect(VALUE self)
 782 {
 783     VALUE str = rb_sprintf("#<%s:%s%s>", rb_obj_classname(self),
 784                       rb_enc_name((rb_encoding*)DATA_PTR(self)),
 785                       (enc_dummy_p(self) ? " (dummy)" : ""));
 786     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 787     return str;
 788 }
 789
 790 /*
 791  * call-seq:
 792  *   enc.name => string
 793  *
 794  * Returns the name of the encoding.
 795  *
 796  *   Encoding::UTF_8.name       => "UTF-8"
 797  */
 798 static VALUE
 799 enc_name(VALUE self)
 800 {
 801     return rb_usascii_str_new2(rb_enc_name((rb_encoding*)DATA_PTR(self)));
 802 }
 803
 804 static VALUE
 805 enc_base_encoding(VALUE self)
 806 {
 807     rb_encoding *base = enc_table.list[must_encoding(self)].base;
 808     if (!base) return Qnil;
 809     return ENC_FROM_ENCODING(base);
 810 }
 811
 812 /*
 813  * call-seq:
 814  *   Encoding.list => [enc1, enc2, ...]
 815  *
 816  * Returns the list of loaded encodings.
 817  *
 818  *   Encoding.list
 819  *   => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
 820  *       #<Encoding:ISO-2022-JP (dummy)>]
 821  *
 822  *   Encoding.find("US-ASCII")
 823  *   => #<Encoding:US-ASCII>
 824  *
 825  *   Encoding.list
 826  *   => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
 827  *       #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
 828  *
 829  */
 830 static VALUE
 831 enc_list(VALUE klass)
 832 {
 833     VALUE ary = rb_ary_new2(0);
 834     rb_ary_replace(ary, rb_encoding_list);
 835     return ary;
 836 }
 837
 838 /*
 839  * call-seq:
 840  *   Encoding.find(string) => enc
 841  *   Encoding.find(symbol) => enc
 842  *
 843  * Search the encoding with specified <i>name</i>.
 844  * <i>name</i> should be a string or symbol.
 845  *
 846  *   Encoding.find("US-ASCII")  => #<Encoding:US-ASCII>
 847  *   Encoding.find(:Shift_JIS)  => #<Encoding:Shift_JIS>
 848  *
 849  */
 850 static VALUE
 851 enc_find(VALUE klass, VALUE enc)
 852 {
 853     return rb_enc_from_encoding(to_encoding(enc));
 854 }
 855
 856 /*
 857  * call-seq:
 858  *   Encoding.compatible?(str1, str2) => enc or nil
 859  *
 860  * Checks the compatibility of two strings.
  861  * If they are compatible, that is, if they can be concatenated,
  862  * returns the encoding which the concatenated string will have.
 863  * If they are not compatible, nil is returned.
 864  *
 865  *   Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
 866  *   => #<Encoding:ISO-8859-1>
 867  *
 868  *   Encoding.compatible?(
 869  *     "\xa1".force_encoding("iso-8859-1"),
 870  *     "\xa1\xa1".force_encoding("euc-jp"))
 871  *   => nil
 872  *
 873  */
 874 static VALUE
 875 enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
 876 {
 877     rb_encoding *enc;
 878
 879     if (!enc_capable(str1)) return Qnil;
 880     if (!enc_capable(str2)) return Qnil;
 881     enc = rb_enc_compatible(str1, str2);
 882     if (!enc) return Qnil;
 883     return rb_enc_from_encoding(enc);
 884 }
 885
 886 /* :nodoc: */
 887 static VALUE
 888 enc_dump(int argc, VALUE *argv, VALUE self)
 889 {
 890     rb_scan_args(argc, argv, "01", 0);
 891     return enc_name(self);
 892 }
 893
 894 /* :nodoc: */
 895 static VALUE
 896 enc_load(VALUE klass, VALUE str)
 897 {
 898     return enc_find(klass, str);
 899 }
 900
 901 rb_encoding *
 902 rb_ascii8bit_encoding(void)
 903 {
 904     if (!enc_table.list) {
 905         rb_enc_init();
 906     }
 907     return enc_table.list[ENCINDEX_ASCII].enc;
 908 }
 909
/* Returns the fixed table index of the ENCINDEX_ASCII slot. */
int
rb_ascii8bit_encindex(void)
{
    return ENCINDEX_ASCII;
}
 915
 916 rb_encoding *
 917 rb_utf8_encoding(void)
 918 {
 919     if (!enc_table.list) {
 920         rb_enc_init();
 921     }
 922     return enc_table.list[ENCINDEX_UTF_8].enc;
 923 }
 924
/* Returns the fixed table index of the ENCINDEX_UTF_8 slot. */
int
rb_utf8_encindex(void)
{
    return ENCINDEX_UTF_8;
}
 930
 931 rb_encoding *
 932 rb_usascii_encoding(void)
 933 {
 934     if (!enc_table.list) {
 935         rb_enc_init();
 936     }
 937     return enc_table.list[ENCINDEX_US_ASCII].enc;
 938 }
 939
/* Returns the fixed table index of the ENCINDEX_US_ASCII slot. */
int
rb_usascii_encindex(void)
{
    return ENCINDEX_US_ASCII;
}
 945
 946 rb_encoding *
 947 rb_locale_encoding(void)
 948 {
 949     VALUE charmap = rb_locale_charmap(rb_cEncoding);
 950     int idx;
 951
 952     if (NIL_P(charmap))
 953         return rb_usascii_encoding();
 954     else
 955         idx = rb_enc_find_index(StringValueCStr(charmap));
 956     if (idx < 0)
 957         return rb_ascii8bit_encoding();
 958
 959     return rb_enc_from_index(idx);
 960 }
 961
 962 rb_encoding *
 963 rb_filesystem_encoding(void)
 964 {
 965     static rb_encoding *enc;
 966     if (!enc) {
 967 #if defined _WIN32
 968         enc = rb_locale_encoding();
 969 #elif defined __APPLE__
 970         enc = rb_enc_find("UTF8-MAC");
 971 #else
 972         enc = rb_locale_encoding();
 973 #endif
 974     }
 975     return enc;
 976 }
 977
 978 static int default_external_index;
 979
 980 rb_encoding *
 981 rb_default_external_encoding(void)
 982 {
 983     return rb_enc_from_index(default_external_index);
 984 }
 985
 986 VALUE
 987 rb_enc_default_external(void)
 988 {
 989     return rb_enc_from_encoding(rb_default_external_encoding());
 990 }
 991
 992 /*
 993  * call-seq:
 994  *   Encoding.default_external => enc
 995  *
 996  * Returns default external encoding.
 997  *
 998  * It is initialized by the locale or -E option.
 999  */
1000 static VALUE
1001 get_default_external(VALUE klass)
1002 {
1003     return rb_enc_default_external();
1004 }
1005
1006 void
1007 rb_enc_set_default_external(VALUE encoding)
1008 {
1009     default_external_index = rb_enc_to_index(rb_to_encoding(encoding));
1010 }
1011
1012 /*
1013  * call-seq:
1014  *   Encoding.locale_charmap => string
1015  *
1016  * Returns the locale charmap name.
1017  *
1018  *   Debian GNU/Linux
1019  *     LANG=C
1020  *       Encoding.locale_charmap  => "ANSI_X3.4-1968"
1021  *     LANG=ja_JP.EUC-JP
1022  *       Encoding.locale_charmap  => "EUC-JP"
1023  *
1024  *   SunOS 5
1025  *     LANG=C
1026  *       Encoding.locale_charmap  => "646"
1027  *     LANG=ja
1028  *       Encoding.locale_charmap  => "eucJP"
1029  *
1030  */
1031 VALUE
1032 rb_locale_charmap(VALUE klass)
1033 {
1034 #if defined NO_LOCALE_CHARMAP
1035     return rb_usascii_str_new2("ASCII-8BIT");
1036 #elif defined HAVE_LANGINFO_H
1037     char *codeset;
1038     codeset = nl_langinfo(CODESET);
1039     return rb_usascii_str_new2(codeset);
1040 #elif defined _WIN32
1041     return rb_sprintf("CP%d", GetACP());
1042 #else
1043     return Qnil;
1044 #endif
1045 }
1046
1047 static void
1048 set_encoding_const(const char *name, rb_encoding *enc)
1049 {
1050     VALUE encoding = rb_enc_from_encoding(enc);
1051     char *s = (char *)name;
1052     int haslower = 0, hasupper = 0, valid = 0;
1053
1054     if (ISDIGIT(*s)) return;
1055     if (ISUPPER(*s)) {
1056         hasupper = 1;
1057         while (*++s && (ISALNUM(*s) || *s == '_')) {
1058             if (ISLOWER(*s)) haslower = 1;
1059         }
1060     }
1061     if (!*s) {
1062         valid = 1;
1063         rb_define_const(rb_cEncoding, name, encoding);
1064     }
1065     if (!valid || haslower) {
1066         int len = strlen(name) + 1;
1067         if (!haslower || !hasupper) {
1068             do {
1069                 if (ISLOWER(*s)) haslower = 1;
1070                 if (ISUPPER(*s)) hasupper = 1;
1071             } while (*++s && (!haslower || !hasupper));
1072         }
1073         MEMCPY(s = ALLOCA_N(char, len), name, char, len);
1074         name = s;
1075         if (!valid) {
1076             if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1077             for (; *s; ++s) {
1078                 if (!ISALNUM(*s)) *s = '_';
1079             }
1080             if (hasupper) {
1081                 rb_define_const(rb_cEncoding, name, encoding);
1082             }
1083         }
1084         if (haslower) {
1085             for (s = (char *)name; *s; ++s) {
1086                 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1087             }
1088             rb_define_const(rb_cEncoding, name, encoding);
1089         }
1090     }
1091 }
1092
1093 static int
1094 rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg)
1095 {
1096     VALUE ary = (VALUE)arg;
1097     VALUE str = rb_usascii_str_new2((char *)name);
1098     OBJ_FREEZE(str);
1099     rb_ary_push(ary, str);
1100     return ST_CONTINUE;
1101 }
1102
1103 /*
1104  * call-seq:
1105  *   Encoding.name_list => ["enc1", "enc2", ...]
1106  *
1107  * Returns the list of available encoding names.
1108  *
1109  *   Encoding.name_list
1110  *   => ["US-ASCII", "ASCII-8BIT", "UTF-8",
1111  *       "ISO-8859-1", "Shift_JIS", "EUC-JP",
1112  *       "Windows-31J",
1113  *       "BINARY", "CP932", "eucJP"]
1114  *
1115  * This list doesn't include dummy encodings.
1116  *
1117  */
1118
1119 static VALUE
1120 rb_enc_name_list(VALUE klass)
1121 {
1122     VALUE ary = rb_ary_new2(enc_table.names->num_entries);
1123     st_foreach(enc_table.names, rb_enc_name_list_i, (st_data_t)ary);
1124     return ary;
1125 }
1126
1127 static int
1128 rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg)
1129 {
1130     VALUE *p = (VALUE *)arg;
1131     VALUE aliases = p[0], ary = p[1];
1132     int idx = (int)orig;
1133     VALUE key, str = rb_ary_entry(ary, idx);
1134
1135     if (NIL_P(str)) {
1136         rb_encoding *enc = rb_enc_from_index(idx);
1137
1138         if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) {
1139             return ST_CONTINUE;
1140         }
1141         str = rb_usascii_str_new2(rb_enc_name(enc));
1142         OBJ_FREEZE(str);
1143         rb_ary_store(ary, idx, str);
1144     }
1145     key = rb_usascii_str_new2((char *)name);
1146     OBJ_FREEZE(key);
1147     rb_hash_aset(aliases, key, str);
1148     return ST_CONTINUE;
1149 }
1150
1151 /*
1152  * call-seq:
1153  *   Encoding.aliases => {"alias1" => "orig1", "alias2" => "orig2", ...}
1154  *
1155  * Returns the hash of available encoding alias and original encoding name.
1156  *
1157  *   Encoding.aliases
1158  *   => {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
1159  *       "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1160  *
1161  */
1162
1163 static VALUE
1164 rb_enc_aliases(VALUE klass)
1165 {
1166     VALUE aliases[2];
1167     aliases[0] = rb_hash_new();
1168     aliases[1] = rb_ary_new();
1169     st_foreach(enc_table.names, rb_enc_aliases_enc_i, (st_data_t)aliases);
1170     return aliases[0];
1171 }
1172
1173 void
1174 Init_Encoding(void)
1175 {
1176 #undef rb_intern
1177     VALUE list;
1178     int i;
1179
1180     id_base_encoding = rb_intern("#base_encoding");
1181
1182     rb_cEncoding = rb_define_class("Encoding", rb_cObject);
1183     rb_undef_alloc_func(rb_cEncoding);
1184     rb_define_method(rb_cEncoding, "to_s", enc_name, 0);
1185     rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
1186     rb_define_method(rb_cEncoding, "name", enc_name, 0);
1187     rb_define_method(rb_cEncoding, "base_encoding", enc_base_encoding, 0);
1188     rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
1189     rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
1190     rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
1191     rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);
1192     rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
1193     rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2);
1194
1195     rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
1196     rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
1197
1198     rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0);
1199     rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0);
1200
1201     rb_gc_register_address(&rb_encoding_list);
1202     list = rb_ary_new2(enc_table.count);
1203     RBASIC(list)->klass = 0;
1204     rb_encoding_list = list;
1205     for (i = 0; i < enc_table.count; ++i) {
1206         rb_ary_push(list, enc_new(enc_table.list[i].enc));
1207     }
1208 }
1209
1210 /* locale insensitive functions */
1211
1212 #define ctype_test(c, ctype) \
1213     (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype))
1214
1215 int rb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); }
1216 int rb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); }
1217 int rb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); }
1218 int rb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); }
1219 int rb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); }
1220 int rb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); }
1221 int rb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); }
1222 int rb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); }
1223 int rb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); }
1224 int rb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); }
1225 int rb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); }
1226 int rb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); }
1227
/* Lower-cases C if it is ASCII; other values are returned unchanged. */
int
rb_tolower(int c)
{
    if (rb_isascii(c)) {
        return ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
    }
    return c;
}
1233
/* Upper-cases C if it is ASCII; other values are returned unchanged. */
int
rb_toupper(int c)
{
    if (rb_isascii(c)) {
        return ONIGENC_ASCII_CODE_TO_UPPER_CASE(c);
    }
    return c;
}
1239